diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 1f417dbada8e6..ef1d99df52bf0 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -6317,10 +6317,13 @@ Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder, // Specialize for cmpxchg // Emit a fence.sc leading fence for cmpxchg seq_cst which are not emulated + SyncScope::ID SSID = cast(Inst)->getSyncScopeID(); if (isReleaseOrStronger(Ord)) - return Ord == AtomicOrdering::SequentiallyConsistent - ? Builder.CreateFence(AtomicOrdering::SequentiallyConsistent) - : Builder.CreateFence(AtomicOrdering::Release); + return Builder.CreateFence( + Ord == AtomicOrdering::SequentiallyConsistent + ? AtomicOrdering::SequentiallyConsistent + : AtomicOrdering::Release, + SSID); return nullptr; } @@ -6332,15 +6335,15 @@ Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder, if (!isa(Inst)) return TargetLoweringBase::emitTrailingFence(Builder, Inst, Ord); + auto *CI = cast(Inst); auto CASWidth = - cast( - dyn_cast(Inst)->getCompareOperand()->getType()) - ->getBitWidth(); + cast(CI->getCompareOperand()->getType())->getBitWidth(); + SyncScope::ID SSID = CI->getSyncScopeID(); // Do not emit a trailing fence for cmpxchg seq_cst which are not emulated if (isAcquireOrStronger(Ord) && (Ord != AtomicOrdering::SequentiallyConsistent || CASWidth < STI.getMinCmpXchgSizeInBits())) - return Builder.CreateFence(AtomicOrdering::Acquire); + return Builder.CreateFence(AtomicOrdering::Acquire, SSID); return nullptr; } diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 193418ca391e5..4dbcf6183efe9 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -41,6 +41,27 @@ def AS_match { }]; } +multiclass nvvm_ternary_atomic_op_scoped { + defvar frag_pat = (frag node:$ptr, node:$cmp, node:$val); + def NAME#_cta: PatFrag(NAME) node:$ptr, node:$cmp, node:$val), [{ + return Scopes[cast(N)->getSyncScopeID()] == NVPTX::Scope::Block; + }]>; + def NAME#_cluster : PatFrag(NAME) node:$ptr, node:$cmp, node:$val), [{ + return Scopes[cast(N)->getSyncScopeID()] == NVPTX::Scope::Cluster; + }]>; + def NAME#_gpu: PatFrag(NAME) node:$ptr, node:$cmp, node:$val), [{ + return Scopes[cast(N)->getSyncScopeID()] == NVPTX::Scope::Device; + }]>; + def NAME#_sys: PatFrag(NAME) node:$ptr, node:$cmp, node:$val), [{ + return Scopes[cast(N)->getSyncScopeID()] == NVPTX::Scope::System; + }]>; +} + + // A node that will be replaced with the current PTX version. class PTX { SDNodeXForm PTXVerXform = SDNodeXForm preds> { - defvar asm_str = "atom" # sem_str # as_str # "." # op_str # " \t$dst, [$addr], $b, $c;"; +multiclass F_ATOMIC_3 preds> { + defvar asm_str = "atom" # sem_str # scope_str # as_str # "." # op_str # " \t$dst, [$addr], $b, $c;"; let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in { def rr : NVPTXInst<(outs t.RC:$dst), (ins ADDR:$addr, t.RC:$b, t.RC:$c), @@ -2149,12 +2170,12 @@ multiclass F_ATOMIC_2_AS, preds>; } -multiclass F_ATOMIC_3_AS preds = []> { +multiclass F_ATOMIC_3_AS preds = []> { defvar frag_pat = (frag node:$a, node:$b, node:$c); - defm _G : F_ATOMIC_3, preds>; - defm _S : F_ATOMIC_3, preds>; - defm _S_C : F_ATOMIC_3, !listconcat([hasClusters], preds)>; - defm _GEN : F_ATOMIC_3, preds>; + defm _G : F_ATOMIC_3, preds>; + defm _S : F_ATOMIC_3, preds>; + defm _S_C : F_ATOMIC_3, !listconcat([hasClusters], preds)>; + defm _GEN : F_ATOMIC_3, preds>; } // atom_add @@ -2205,18 +2226,30 @@ foreach t = [I32RT, I64RT] in { foreach order = ["acquire", "release", "acq_rel", "monotonic"] in { defvar cas_order_string = !if(!eq(order, "monotonic"), ".relaxed", "."#order); defvar atomic_cmp_swap_pat = !cast("atomic_cmp_swap_i"#t.Size#_#order); + + // Instantiate scoped versions of the atomic compare and swap pattern + defm atomic_cmp_swap_i#t.Size#_#order: nvvm_ternary_atomic_op_scoped; + + foreach scope = ["cta", "cluster", "gpu", "sys"] in { + defvar atomic_cmp_swap_pat_scoped = !cast("atomic_cmp_swap_i"#t.Size#_#order#_#scope); + + // Syncscope is only supported for SM70+ + defm INT_PTX_ATOM_CAS_#t.Size#_#order#_#scope + : F_ATOMIC_3_AS, hasPTX<63>]>; + } + // Note that AtomicExpand will convert cmpxchg seq_cst to a cmpxchg monotonic with fences around it. // Memory orders are only supported for SM70+, PTX63+- so we have two sets of instruction definitions- // for SM70+, and "old" ones which lower to "atom.cas", for earlier archs. defm INT_PTX_ATOM_CAS_#t.Size#_#order - : F_ATOMIC_3_AS, hasPTX<63>]>; + : F_ATOMIC_3_AS, hasPTX<63>]>; defm INT_PTX_ATOM_CAS_#t.Size#_#order#_old - : F_ATOMIC_3_AS; + : F_ATOMIC_3_AS; } } // Note that 16-bit CAS support in PTX is emulated. -defm INT_PTX_ATOM_CAS_16 : F_ATOMIC_3_AS, hasPTX<63>]>; +defm INT_PTX_ATOM_CAS_16 : F_ATOMIC_3_AS, hasPTX<63>]>; // Support for scoped atomic operations. Matches // int_nvvm_atomic_{op}_{space}_{type}_{scope} @@ -2246,7 +2279,8 @@ multiclass ATOM3N_impl Preds> { defm "" : F_ATOMIC_3( "int_nvvm_atomic_" # OpStr diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll index b5a4f94611453..54dfe2eb2bca5 100644 --- a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll +++ b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll @@ -71,7 +71,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat ; CHECKPTX71-NEXT: shl.b32 %r30, %r29, %r2; ; CHECKPTX71-NEXT: and.b32 %r31, %r54, %r3; ; CHECKPTX71-NEXT: or.b32 %r32, %r31, %r30; -; CHECKPTX71-NEXT: atom.relaxed.cas.b32 %r6, [%r1], %r54, %r32; +; CHECKPTX71-NEXT: atom.relaxed.sys.cas.b32 %r6, [%r1], %r54, %r32; ; CHECKPTX71-NEXT: setp.ne.s32 %p1, %r6, %r54; ; CHECKPTX71-NEXT: mov.b32 %r54, %r6; ; CHECKPTX71-NEXT: @%p1 bra $L__BB0_1; @@ -87,7 +87,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat ; CHECKPTX71-NEXT: shl.b32 %r35, %r34, %r2; ; CHECKPTX71-NEXT: and.b32 %r36, %r55, %r3; ; CHECKPTX71-NEXT: or.b32 %r37, %r36, %r35; -; CHECKPTX71-NEXT: atom.relaxed.cas.b32 %r9, [%r1], %r55, %r37; +; CHECKPTX71-NEXT: atom.relaxed.sys.cas.b32 %r9, [%r1], %r55, %r37; ; CHECKPTX71-NEXT: setp.ne.s32 %p2, %r9, %r55; ; CHECKPTX71-NEXT: mov.b32 %r55, %r9; ; CHECKPTX71-NEXT: @%p2 bra $L__BB0_3; @@ -109,7 +109,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat ; CHECKPTX71-NEXT: shl.b32 %r43, %r42, %r11; ; CHECKPTX71-NEXT: and.b32 %r44, %r56, %r12; ; CHECKPTX71-NEXT: or.b32 %r45, %r44, %r43; -; CHECKPTX71-NEXT: atom.relaxed.global.cas.b32 %r15, [%r10], %r56, %r45; +; CHECKPTX71-NEXT: atom.relaxed.sys.global.cas.b32 %r15, [%r10], %r56, %r45; ; CHECKPTX71-NEXT: setp.ne.s32 %p3, %r15, %r56; ; CHECKPTX71-NEXT: mov.b32 %r56, %r15; ; CHECKPTX71-NEXT: @%p3 bra $L__BB0_5; @@ -131,7 +131,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat ; CHECKPTX71-NEXT: shl.b32 %r51, %r50, %r17; ; CHECKPTX71-NEXT: and.b32 %r52, %r57, %r18; ; CHECKPTX71-NEXT: or.b32 %r53, %r52, %r51; -; CHECKPTX71-NEXT: atom.relaxed.shared.cas.b32 %r21, [%r16], %r57, %r53; +; CHECKPTX71-NEXT: atom.relaxed.sys.shared.cas.b32 %r21, [%r16], %r57, %r53; ; CHECKPTX71-NEXT: setp.ne.s32 %p4, %r21, %r57; ; CHECKPTX71-NEXT: mov.b32 %r57, %r21; ; CHECKPTX71-NEXT: @%p4 bra $L__BB0_7; diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll index 65a077d67e4ba..d8e676c76d547 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll @@ -47,8 +47,8 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ret i8 %new } -define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_monotonic_i8_global( +define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -56,8 +56,8 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -68,15 +68,15 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB1_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB1_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -88,12 +88,12 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: $L__BB1_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_monotonic_i8_shared( +define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -101,8 +101,8 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -113,15 +113,15 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB2_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -133,12 +133,12 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: $L__BB2_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new } -define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_generic( +define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -146,8 +146,8 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -158,7 +158,7 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -176,15 +176,14 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB3_1; ; SM60-NEXT: $L__BB3_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new } -define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_global( +define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_global( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -192,8 +191,8 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -204,7 +203,7 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -222,15 +221,14 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB4_1; ; SM60-NEXT: $L__BB4_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic monotonic ret i8 %new } -define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_shared( +define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -238,8 +236,8 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -250,15 +248,15 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB5_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB5_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -268,15 +266,14 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB5_1; ; SM60-NEXT: $L__BB5_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new } -define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_seq_cst_i8_generic( +define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -284,9 +281,8 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -297,15 +293,15 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB6_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB6_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -315,15 +311,14 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB6_1; ; SM60-NEXT: $L__BB6_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new } -define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_seq_cst_i8_global( +define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -331,9 +326,8 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -344,7 +338,7 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -362,15 +356,14 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB7_1; ; SM60-NEXT: $L__BB7_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new } -define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_seq_cst_i8_shared( +define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_shared( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -378,9 +371,8 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -391,7 +383,7 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -409,15 +401,14 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB8_1; ; SM60-NEXT: $L__BB8_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic monotonic ret i8 %new } -define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_generic( +define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -425,8 +416,8 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -437,15 +428,15 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB9_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -455,15 +446,14 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB9_1; ; SM60-NEXT: $L__BB9_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new } -define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_global( +define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -471,8 +461,8 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -483,15 +473,15 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB10_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB10_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -501,15 +491,14 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB10_1; ; SM60-NEXT: $L__BB10_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new } -define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_shared( +define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -517,8 +506,8 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -529,7 +518,7 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -547,15 +536,14 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB11_1; ; SM60-NEXT: $L__BB11_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new } -define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_generic( +define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_generic( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -563,8 +551,8 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -575,7 +563,7 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -596,12 +584,12 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic acquire ret i8 %new } -define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_global( +define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -609,8 +597,8 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -621,15 +609,15 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB13_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB13_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -642,12 +630,12 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire ret i8 %new } -define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_shared( +define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -655,8 +643,8 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -667,15 +655,15 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB14_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB14_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -685,15 +673,15 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB14_1; ; SM60-NEXT: $L__BB14_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_seq_cst_i8_generic( +define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -701,9 +689,8 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -714,7 +701,7 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -732,15 +719,15 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB15_1; ; SM60-NEXT: $L__BB15_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_seq_cst_i8_global( +define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_global( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -748,9 +735,8 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -761,7 +747,7 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -782,12 +768,12 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_seq_cst_i8_shared( +define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -795,9 +781,8 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -808,15 +793,15 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB17_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB17_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -829,12 +814,12 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire ret i8 %new } -define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_monotonic_i8_generic( +define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -842,9 +827,8 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -855,15 +839,15 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB18_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB18_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -873,14 +857,15 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB18_1; ; SM60-NEXT: $L__BB18_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_monotonic_i8_global( +define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -888,9 +873,8 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -901,7 +885,7 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -919,14 +903,15 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB19_1; ; SM60-NEXT: $L__BB19_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire ret i8 %new } -define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_monotonic_i8_shared( +define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_shared( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -934,9 +919,8 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -947,7 +931,7 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -965,14 +949,15 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB20_1; ; SM60-NEXT: $L__BB20_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic acquire ret i8 %new } -define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_acquire_i8_generic( +define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -980,9 +965,8 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -993,15 +977,15 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB21_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB21_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1014,12 +998,12 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire ret i8 %new } -define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_acquire_i8_global( +define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1027,9 +1011,8 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1040,15 +1023,15 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB22_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB22_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1058,15 +1041,15 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB22_1; ; SM60-NEXT: $L__BB22_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_acquire_i8_shared( +define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1074,9 +1057,8 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1087,7 +1069,7 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -1105,15 +1087,15 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB23_1; ; SM60-NEXT: $L__BB23_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire ret i8 %new } -define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_generic( +define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_generic( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1121,8 +1103,8 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1134,7 +1116,7 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -1155,12 +1137,12 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic seq_cst ret i8 %new } -define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_global( +define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1168,8 +1150,8 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_sys_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1181,15 +1163,15 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB25_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB25_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1202,12 +1184,12 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new } -define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_shared( +define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1215,9 +1197,9 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1228,15 +1210,15 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB26_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB26_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1246,15 +1228,15 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB26_1; ; SM60-NEXT: $L__BB26_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new } -define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_monotonic_i8_generic( +define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1262,9 +1244,9 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1275,7 +1257,7 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -1293,15 +1275,15 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB27_1; ; SM60-NEXT: $L__BB27_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst ret i8 %new } -define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_monotonic_i8_global( +define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_global( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1309,8 +1291,8 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1322,7 +1304,7 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -1343,12 +1325,12 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic seq_cst ret i8 %new } -define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_monotonic_i8_shared( +define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1356,8 +1338,8 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_sys_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1369,15 +1351,15 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB29_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB29_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1390,12 +1372,12 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new } -define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_acquire_i8_generic( +define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1403,9 +1385,9 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1416,15 +1398,15 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB30_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB30_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1434,15 +1416,15 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB30_1; ; SM60-NEXT: $L__BB30_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new } -define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_acquire_i8_global( +define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1450,9 +1432,9 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1463,7 +1445,7 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -1481,15 +1463,15 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB31_1; ; SM60-NEXT: $L__BB31_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst ret i8 %new } -define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_acquire_i8_shared( +define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_shared( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1497,8 +1479,8 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1510,7 +1492,7 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -1531,12 +1513,12 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_generic( +define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1544,8 +1526,8 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_sys_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1557,15 +1539,15 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB33_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB33_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1578,12 +1560,12 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_global( +define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1591,9 +1573,9 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1604,15 +1586,15 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB34_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB34_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1622,15 +1604,15 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB34_1; ; SM60-NEXT: $L__BB34_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_shared( +define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1638,9 +1620,9 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1651,7 +1633,7 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -1669,15 +1651,15 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB35_1; ; SM60-NEXT: $L__BB35_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst ret i8 %new } -define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_monotonic_i8_generic( +define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_generic( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1685,9 +1667,8 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1698,7 +1679,7 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -1719,12 +1700,12 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire monotonic ret i8 %new } -define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_monotonic_i8_global( +define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1732,9 +1713,8 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1745,15 +1725,15 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB37_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB37_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1766,12 +1746,12 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic ret i8 %new } -define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_monotonic_i8_shared( +define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1779,9 +1759,8 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1792,15 +1771,15 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB38_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB38_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1810,15 +1789,15 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB38_1; ; SM60-NEXT: $L__BB38_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new } -define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_generic( +define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1826,9 +1805,8 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1839,7 +1817,7 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -1857,15 +1835,15 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB39_1; ; SM60-NEXT: $L__BB39_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic ret i8 %new } -define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_global( +define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_global( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1873,9 +1851,8 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1886,7 +1863,7 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -1907,12 +1884,12 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire monotonic ret i8 %new } -define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_shared( +define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1920,9 +1897,8 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1933,15 +1909,15 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB41_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB41_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1954,12 +1930,12 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic ret i8 %new } -define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_generic( +define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1967,9 +1943,8 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1980,15 +1955,15 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB42_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB42_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1998,15 +1973,15 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB42_1; ; SM60-NEXT: $L__BB42_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new } -define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_global( +define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2014,9 +1989,8 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2027,7 +2001,7 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -2045,15 +2019,15 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB43_1; ; SM60-NEXT: $L__BB43_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic ret i8 %new } -define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_shared( +define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_shared( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2061,9 +2035,8 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2074,7 +2047,7 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -2095,3586 +2068,20641 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire monotonic ret i8 %new } -define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_monotonic_i16_generic( +define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB45_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB45_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB45_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB45_1; ; SM60-NEXT: $L__BB45_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic + ret i8 %new } -define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_monotonic_i16_global( +define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB46_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB46_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB46_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB46_1; ; SM60-NEXT: $L__BB46_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic monotonic - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic + ret i8 %new } -define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_monotonic_i16_shared( +define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB47_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB47_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB47_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB47_1; ; SM60-NEXT: $L__BB47_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic monotonic - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic + ret i8 %new } -define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_acquire_i16_generic( +define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_generic( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB48_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB48_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB48_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB48_1; ; SM60-NEXT: $L__BB48_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic acquire - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire + ret i8 %new } -define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_acquire_i16_global( +define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB49_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB49_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB49_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB49_1; ; SM60-NEXT: $L__BB49_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic acquire - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire acquire + ret i8 %new } -define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_acquire_i16_shared( +define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB50_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB50_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB50_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB50_1; ; SM60-NEXT: $L__BB50_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic acquire - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire + ret i8 %new } -define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_seq_cst_i16_generic( +define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB51_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB51_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB51_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB51_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB51_1; ; SM60-NEXT: $L__BB51_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic seq_cst - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire + ret i8 %new } -define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_seq_cst_i16_global( +define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_global( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB52_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB52_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB52_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB52_1; ; SM60-NEXT: $L__BB52_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire acquire + ret i8 %new } -define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_seq_cst_i16_shared( +define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB53_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB53_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB53_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB53_1; ; SM60-NEXT: $L__BB53_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire + ret i8 %new } -define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_monotonic_i16_generic( +define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB54_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB54_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB54_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB54_1; ; SM60-NEXT: $L__BB54_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire monotonic - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire + ret i8 %new } -define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_monotonic_i16_global( +define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB55_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB55_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB55_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB55_1; ; SM60-NEXT: $L__BB55_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire monotonic - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire + ret i8 %new } -define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_monotonic_i16_shared( +define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_shared( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB56_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB56_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB56_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB56_1; ; SM60-NEXT: $L__BB56_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire monotonic - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire acquire + ret i8 %new } -define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_acquire_i16_generic( +define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB57_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB57_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB57_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB57_1; ; SM60-NEXT: $L__BB57_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire + ret i8 %new } -define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_acquire_i16_global( +define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB58_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB58_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB58_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB58_1; ; SM60-NEXT: $L__BB58_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire + ret i8 %new } -define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_acquire_i16_shared( +define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB59_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB59_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB59_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB59_1; ; SM60-NEXT: $L__BB59_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire + ret i8 %new } -define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_seq_cst_i16_generic( +define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_generic( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB60_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB60_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB60_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB60_1; ; SM60-NEXT: $L__BB60_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire seq_cst - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire seq_cst + ret i8 %new } -define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_seq_cst_i16_global( +define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB61_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB61_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB61_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB61_1; ; SM60-NEXT: $L__BB61_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire seq_cst - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst + ret i8 %new } -define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_seq_cst_i16_shared( +define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB62_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB62_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB62_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB62_1; ; SM60-NEXT: $L__BB62_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire seq_cst - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst + ret i8 %new } -define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_monotonic_i16_generic( +define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB63_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB63_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB63_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB63_1; ; SM60-NEXT: $L__BB63_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst + ret i8 %new } -define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_monotonic_i16_global( +define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_global( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB64_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB64_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB64_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB64_1; ; SM60-NEXT: $L__BB64_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release monotonic - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire seq_cst + ret i8 %new } -define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_monotonic_i16_shared( +define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB65_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB65_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB65_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB65_1; ; SM60-NEXT: $L__BB65_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release monotonic - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst + ret i8 %new } -define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_acquire_i16_generic( +define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB66_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB66_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB66_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB66_1; ; SM60-NEXT: $L__BB66_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst + ret i8 %new } -define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_acquire_i16_global( +define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB67_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB67_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB67_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB67_1; ; SM60-NEXT: $L__BB67_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst + ret i8 %new } -define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_acquire_i16_shared( +define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_shared( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB68_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB68_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB68_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB68_1; ; SM60-NEXT: $L__BB68_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire seq_cst + ret i8 %new } -define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_seq_cst_i16_generic( +define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB69_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB69_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB69_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB69_1; ; SM60-NEXT: $L__BB69_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst + ret i8 %new } -define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_seq_cst_i16_global( +define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB70_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB70_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB70_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB70_1; ; SM60-NEXT: $L__BB70_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst + ret i8 %new } -define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_seq_cst_i16_shared( +define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB71_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB71_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB71_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB71_1; ; SM60-NEXT: $L__BB71_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst + ret i8 %new } -define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_monotonic_i16_generic( +define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_generic( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB72_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB72_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB72_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB72_1; ; SM60-NEXT: $L__BB72_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release monotonic + ret i8 %new } -define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_monotonic_i16_global( +define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB73_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB73_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB73_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB73_1; ; SM60-NEXT: $L__BB73_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release monotonic + ret i8 %new } -define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_monotonic_i16_shared( +define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB74_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB74_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB74_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB74_1; ; SM60-NEXT: $L__BB74_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release monotonic + ret i8 %new } -define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_acquire_i16_generic( +define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB75_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB75_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB75_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB75_1; ; SM60-NEXT: $L__BB75_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel acquire - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release monotonic + ret i8 %new } -define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_acquire_i16_global( +define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_global( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB76_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB76_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB76_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB76_1; ; SM60-NEXT: $L__BB76_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release monotonic + ret i8 %new } -define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_acquire_i16_shared( +define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB77_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB77_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB77_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB77_1; ; SM60-NEXT: $L__BB77_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release monotonic + ret i8 %new } -define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_seq_cst_i16_generic( +define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB78_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB78_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB78_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB78_1; ; SM60-NEXT: $L__BB78_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic + ret i8 %new } -define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_seq_cst_i16_global( +define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB79_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB79_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB79_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB79_1; ; SM60-NEXT: $L__BB79_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic + ret i8 %new } -define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_seq_cst_i16_shared( +define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_shared( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB80_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB80_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB80_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB80_1; ; SM60-NEXT: $L__BB80_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release monotonic + ret i8 %new } -define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_monotonic_i16_generic( +define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB81_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB81_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB81_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB81_1; ; SM60-NEXT: $L__BB81_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst monotonic - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release monotonic + ret i8 %new } -define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_monotonic_i16_global( +define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB82_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB82_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB82_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB82_1; ; SM60-NEXT: $L__BB82_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst monotonic - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic + ret i8 %new } -define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_monotonic_i16_shared( +define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB83_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB83_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB83_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB83_1; ; SM60-NEXT: $L__BB83_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst monotonic - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic + ret i8 %new } -define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_acquire_i16_generic( +define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_generic( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB84_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB84_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB84_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB84_1; ; SM60-NEXT: $L__BB84_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst acquire - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release acquire + ret i8 %new } -define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_acquire_i16_global( +define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB85_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB85_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB85_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB85_1; ; SM60-NEXT: $L__BB85_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst acquire - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release acquire + ret i8 %new } -define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_acquire_i16_shared( +define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB86_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB86_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB86_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB86_1; ; SM60-NEXT: $L__BB86_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst acquire - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release acquire + ret i8 %new } -define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_seq_cst_i16_generic( +define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB87_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB87_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB87_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB87_1; ; SM60-NEXT: $L__BB87_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release acquire + ret i8 %new } -define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_seq_cst_i16_global( +define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_global( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB88_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB88_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB88_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB88_1; ; SM60-NEXT: $L__BB88_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release acquire + ret i8 %new +} + +define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB89_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB89_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB89_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB89_1; +; SM60-NEXT: $L__BB89_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release acquire + ret i8 %new +} + +define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB90_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB90_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB90_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB90_1; +; SM60-NEXT: $L__BB90_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire + ret i8 %new +} + +define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB91_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB91_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB91_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB91_1; +; SM60-NEXT: $L__BB91_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release acquire + ret i8 %new +} + +define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB92_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB92_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB92_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB92_1; +; SM60-NEXT: $L__BB92_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release acquire + ret i8 %new +} + +define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB93_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB93_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB93_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB93_1; +; SM60-NEXT: $L__BB93_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release acquire + ret i8 %new +} + +define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB94_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB94_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB94_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB94_1; +; SM60-NEXT: $L__BB94_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release acquire + ret i8 %new +} + +define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB95_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB95_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB95_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB95_1; +; SM60-NEXT: $L__BB95_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release acquire + ret i8 %new +} + +define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB96_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB96_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB96_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB96_1; +; SM60-NEXT: $L__BB96_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB97_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB97_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB97_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB97_1; +; SM60-NEXT: $L__BB97_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB98_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB98_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB98_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB98_1; +; SM60-NEXT: $L__BB98_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB99_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB99_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB99_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB99_1; +; SM60-NEXT: $L__BB99_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB100_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB100_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB100_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB100_1; +; SM60-NEXT: $L__BB100_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB101_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB101_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB101_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB101_1; +; SM60-NEXT: $L__BB101_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB102_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB102_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB102_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB102_1; +; SM60-NEXT: $L__BB102_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB103_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB103_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB103_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB103_1; +; SM60-NEXT: $L__BB103_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB104_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB104_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB104_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB104_1; +; SM60-NEXT: $L__BB104_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB105_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB105_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB105_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB105_1; +; SM60-NEXT: $L__BB105_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB106_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB106_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB106_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB106_1; +; SM60-NEXT: $L__BB106_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB107_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB107_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB107_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB107_1; +; SM60-NEXT: $L__BB107_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB108_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB108_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB108_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB108_1; +; SM60-NEXT: $L__BB108_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB109_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB109_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB109_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB109_1; +; SM60-NEXT: $L__BB109_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB110_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB110_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB110_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB110_1; +; SM60-NEXT: $L__BB110_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB111_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB111_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB111_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB111_1; +; SM60-NEXT: $L__BB111_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB112_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB112_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB112_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB112_1; +; SM60-NEXT: $L__BB112_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB113_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB113_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB113_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB113_1; +; SM60-NEXT: $L__BB113_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB114_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB114_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB114_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB114_1; +; SM60-NEXT: $L__BB114_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB115_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB115_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB115_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB115_1; +; SM60-NEXT: $L__BB115_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB116_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB116_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB116_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB116_1; +; SM60-NEXT: $L__BB116_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB117_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB117_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB117_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB117_1; +; SM60-NEXT: $L__BB117_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB118_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB118_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB118_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB118_1; +; SM60-NEXT: $L__BB118_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB119_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB119_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB119_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB119_1; +; SM60-NEXT: $L__BB119_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB120_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB120_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB120_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB120_1; +; SM60-NEXT: $L__BB120_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB121_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB121_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB121_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB121_1; +; SM60-NEXT: $L__BB121_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB122_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB122_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB122_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB122_1; +; SM60-NEXT: $L__BB122_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB123_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB123_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB123_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB123_1; +; SM60-NEXT: $L__BB123_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB124_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB124_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB124_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB124_1; +; SM60-NEXT: $L__BB124_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB125_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB125_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB125_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB125_1; +; SM60-NEXT: $L__BB125_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB126_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB126_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB126_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB126_1; +; SM60-NEXT: $L__BB126_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB127_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB127_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB127_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB127_1; +; SM60-NEXT: $L__BB127_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB128_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB128_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB128_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB128_1; +; SM60-NEXT: $L__BB128_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB129_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB129_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB129_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB129_1; +; SM60-NEXT: $L__BB129_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB130_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB130_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB130_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB130_1; +; SM60-NEXT: $L__BB130_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB131_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB131_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB131_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB131_1; +; SM60-NEXT: $L__BB131_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB132_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB132_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB132_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB132_1; +; SM60-NEXT: $L__BB132_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB133_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB133_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB133_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB133_1; +; SM60-NEXT: $L__BB133_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB134_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB134_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB134_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB134_1; +; SM60-NEXT: $L__BB134_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB135_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB135_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB135_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB135_1; +; SM60-NEXT: $L__BB135_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB136_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB136_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB136_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB136_1; +; SM60-NEXT: $L__BB136_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB137_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB137_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB137_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB137_1; +; SM60-NEXT: $L__BB137_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB138_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB138_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB138_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB138_1; +; SM60-NEXT: $L__BB138_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB139_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB139_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB139_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB139_1; +; SM60-NEXT: $L__BB139_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB140_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB140_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB140_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB140_1; +; SM60-NEXT: $L__BB140_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB141_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB141_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB141_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB141_1; +; SM60-NEXT: $L__BB141_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB142_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB142_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB142_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB142_1; +; SM60-NEXT: $L__BB142_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB143_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB143_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB143_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB143_1; +; SM60-NEXT: $L__BB143_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB144_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB144_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB144_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB144_1; +; SM60-NEXT: $L__BB144_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB145_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB145_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB145_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB145_1; +; SM60-NEXT: $L__BB145_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB146_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB146_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB146_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB146_1; +; SM60-NEXT: $L__BB146_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB147_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB147_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB147_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB147_1; +; SM60-NEXT: $L__BB147_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB148_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB148_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB148_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB148_1; +; SM60-NEXT: $L__BB148_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB149_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB149_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB149_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB149_1; +; SM60-NEXT: $L__BB149_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB150_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB150_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB150_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB150_1; +; SM60-NEXT: $L__BB150_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB151_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB151_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB151_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB151_1; +; SM60-NEXT: $L__BB151_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB152_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB152_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB152_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB152_1; +; SM60-NEXT: $L__BB152_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB153_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB153_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB153_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB153_1; +; SM60-NEXT: $L__BB153_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB154_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB154_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB154_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB154_1; +; SM60-NEXT: $L__BB154_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB155_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB155_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB155_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB155_1; +; SM60-NEXT: $L__BB155_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB156_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB156_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB156_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB156_1; +; SM60-NEXT: $L__BB156_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB157_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB157_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB157_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB157_1; +; SM60-NEXT: $L__BB157_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB158_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB158_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB158_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB158_1; +; SM60-NEXT: $L__BB158_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB159_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB159_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB159_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB159_1; +; SM60-NEXT: $L__BB159_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB160_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB160_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB160_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB160_1; +; SM60-NEXT: $L__BB160_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB161_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB161_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB161_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB161_1; +; SM60-NEXT: $L__BB161_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB162_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB162_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB162_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB162_1; +; SM60-NEXT: $L__BB162_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB163_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB163_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB163_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB163_1; +; SM60-NEXT: $L__BB163_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB164_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB164_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB164_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB164_1; +; SM60-NEXT: $L__BB164_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB165_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB165_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB165_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB165_1; +; SM60-NEXT: $L__BB165_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB166_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB166_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB166_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB166_1; +; SM60-NEXT: $L__BB166_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB167_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB167_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB167_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB167_1; +; SM60-NEXT: $L__BB167_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB168_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB168_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB168_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB168_1; +; SM60-NEXT: $L__BB168_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB169_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB169_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB169_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB169_1; +; SM60-NEXT: $L__BB169_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB170_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB170_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB170_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB170_1; +; SM60-NEXT: $L__BB170_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB171_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB171_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB171_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB171_1; +; SM60-NEXT: $L__BB171_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB172_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB172_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB172_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB172_1; +; SM60-NEXT: $L__BB172_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB173_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB173_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB173_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB173_1; +; SM60-NEXT: $L__BB173_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB174_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB174_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB174_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB174_1; +; SM60-NEXT: $L__BB174_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB175_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB175_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB175_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB175_1; +; SM60-NEXT: $L__BB175_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB176_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB176_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB176_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB176_1; +; SM60-NEXT: $L__BB176_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB177_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB177_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB177_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB177_1; +; SM60-NEXT: $L__BB177_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB178_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB178_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB178_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB178_1; +; SM60-NEXT: $L__BB178_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB179_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB179_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB179_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB179_1; +; SM60-NEXT: $L__BB179_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst + ret i8 %new +} + +define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB180_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB180_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB180_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB180_1; +; SM60-NEXT: $L__BB180_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_sys_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB181_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB181_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB181_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB181_1; +; SM60-NEXT: $L__BB181_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_cta_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB182_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB182_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB182_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB182_1; +; SM60-NEXT: $L__BB182_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_gpu_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB183_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB183_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB183_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB183_1; +; SM60-NEXT: $L__BB183_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB184_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB184_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB184_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB184_1; +; SM60-NEXT: $L__BB184_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_sys_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB185_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB185_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB185_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB185_1; +; SM60-NEXT: $L__BB185_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_cta_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB186_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB186_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB186_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB186_1; +; SM60-NEXT: $L__BB186_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_gpu_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB187_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB187_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB187_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB187_1; +; SM60-NEXT: $L__BB187_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB188_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB188_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB188_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB188_1; +; SM60-NEXT: $L__BB188_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_sys_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB189_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB189_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB189_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB189_1; +; SM60-NEXT: $L__BB189_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_cta_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB190_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB190_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB190_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB190_1; +; SM60-NEXT: $L__BB190_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_gpu_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB191_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB191_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB191_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB191_1; +; SM60-NEXT: $L__BB191_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB192_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB192_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB192_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB192_1; +; SM60-NEXT: $L__BB192_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_sys_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB193_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB193_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB193_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB193_1; +; SM60-NEXT: $L__BB193_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_cta_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB194_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB194_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB194_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB194_1; +; SM60-NEXT: $L__BB194_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_gpu_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB195_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB195_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB195_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB195_1; +; SM60-NEXT: $L__BB195_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB196_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB196_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB196_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB196_1; +; SM60-NEXT: $L__BB196_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_sys_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB197_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB197_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB197_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB197_1; +; SM60-NEXT: $L__BB197_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_cta_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB198_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB198_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB198_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB198_1; +; SM60-NEXT: $L__BB198_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_gpu_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB199_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB199_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB199_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB199_1; +; SM60-NEXT: $L__BB199_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB200_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB200_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB200_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB200_1; +; SM60-NEXT: $L__BB200_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_sys_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB201_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB201_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB201_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB201_1; +; SM60-NEXT: $L__BB201_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_cta_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB202_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB202_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB202_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB202_1; +; SM60-NEXT: $L__BB202_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_gpu_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB203_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB203_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB203_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB203_1; +; SM60-NEXT: $L__BB203_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB204_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB204_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB204_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB204_1; +; SM60-NEXT: $L__BB204_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB205_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB205_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB205_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB205_1; +; SM60-NEXT: $L__BB205_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB206_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB206_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB206_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB206_1; +; SM60-NEXT: $L__BB206_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB207_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB207_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB207_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB207_1; +; SM60-NEXT: $L__BB207_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB208_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB208_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB208_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB208_1; +; SM60-NEXT: $L__BB208_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB209_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB209_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB209_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB209_1; +; SM60-NEXT: $L__BB209_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB210_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB210_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB210_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB210_1; +; SM60-NEXT: $L__BB210_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB211_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB211_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB211_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB211_1; +; SM60-NEXT: $L__BB211_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB212_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB212_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB212_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB212_1; +; SM60-NEXT: $L__BB212_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB213_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB213_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB213_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB213_1; +; SM60-NEXT: $L__BB213_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB214_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB214_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB214_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB214_1; +; SM60-NEXT: $L__BB214_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB215_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB215_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB215_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB215_1; +; SM60-NEXT: $L__BB215_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst + ret i16 %new +} + +define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB216_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB216_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB216_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB216_1; +; SM60-NEXT: $L__BB216_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_sys_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB217_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB217_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB217_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB217_1; +; SM60-NEXT: $L__BB217_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_cta_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB218_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB218_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB218_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB218_1; +; SM60-NEXT: $L__BB218_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_gpu_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB219_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB219_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB219_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB219_1; +; SM60-NEXT: $L__BB219_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB220_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB220_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB220_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB220_1; +; SM60-NEXT: $L__BB220_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_sys_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB221_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB221_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB221_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB221_1; +; SM60-NEXT: $L__BB221_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_cta_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB222_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB222_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB222_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB222_1; +; SM60-NEXT: $L__BB222_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_gpu_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB223_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB223_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB223_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB223_1; +; SM60-NEXT: $L__BB223_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB224_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB224_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB224_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB224_1; +; SM60-NEXT: $L__BB224_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_sys_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB225_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB225_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB225_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB225_1; +; SM60-NEXT: $L__BB225_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_cta_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB226_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB226_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB226_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB226_1; +; SM60-NEXT: $L__BB226_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_gpu_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB227_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB227_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB227_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB227_1; +; SM60-NEXT: $L__BB227_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic + ret i16 %new +} + +define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB228_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB228_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB228_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB228_1; +; SM60-NEXT: $L__BB228_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_sys_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB229_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB229_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB229_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB229_1; +; SM60-NEXT: $L__BB229_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_cta_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB230_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB230_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB230_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB230_1; +; SM60-NEXT: $L__BB230_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_gpu_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB231_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB231_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB231_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB231_1; +; SM60-NEXT: $L__BB231_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB232_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB232_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB232_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB232_1; +; SM60-NEXT: $L__BB232_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_sys_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB233_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB233_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB233_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB233_1; +; SM60-NEXT: $L__BB233_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_cta_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB234_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB234_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB234_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB234_1; +; SM60-NEXT: $L__BB234_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_gpu_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB235_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB235_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB235_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB235_1; +; SM60-NEXT: $L__BB235_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB236_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB236_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB236_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB236_1; +; SM60-NEXT: $L__BB236_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_sys_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB237_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB237_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB237_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB237_1; +; SM60-NEXT: $L__BB237_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_cta_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB238_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB238_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB238_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB238_1; +; SM60-NEXT: $L__BB238_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_gpu_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB239_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB239_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB239_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB239_1; +; SM60-NEXT: $L__BB239_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB240_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB240_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB240_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB240_1; +; SM60-NEXT: $L__BB240_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB241_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB241_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB241_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB241_1; +; SM60-NEXT: $L__BB241_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB242_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB242_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB242_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB242_1; +; SM60-NEXT: $L__BB242_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB243_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB243_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB243_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB243_1; +; SM60-NEXT: $L__BB243_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB244_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB244_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB244_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB244_1; +; SM60-NEXT: $L__BB244_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB245_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB245_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB245_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB245_1; +; SM60-NEXT: $L__BB245_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB246_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB246_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB246_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB246_1; +; SM60-NEXT: $L__BB246_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB247_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB247_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB247_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB247_1; +; SM60-NEXT: $L__BB247_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB248_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB248_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB248_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB248_1; +; SM60-NEXT: $L__BB248_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB249_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB249_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB249_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB249_1; +; SM60-NEXT: $L__BB249_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB250_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB250_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB250_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB250_1; +; SM60-NEXT: $L__BB250_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB251_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB251_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB251_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB251_1; +; SM60-NEXT: $L__BB251_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst + ret i16 %new +} + +define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB252_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB252_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB252_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB252_1; +; SM60-NEXT: $L__BB252_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB253_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB253_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB253_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB253_1; +; SM60-NEXT: $L__BB253_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB254_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB254_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB254_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB254_1; +; SM60-NEXT: $L__BB254_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB255_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB255_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB255_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB255_1; +; SM60-NEXT: $L__BB255_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB256_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB256_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB256_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB256_1; +; SM60-NEXT: $L__BB256_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB257_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB257_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB257_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB257_1; +; SM60-NEXT: $L__BB257_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB258_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB258_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB258_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB258_1; +; SM60-NEXT: $L__BB258_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB259_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB259_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB259_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB259_1; +; SM60-NEXT: $L__BB259_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB260_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB260_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB260_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB260_1; +; SM60-NEXT: $L__BB260_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB261_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB261_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB261_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB261_1; +; SM60-NEXT: $L__BB261_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB262_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB262_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB262_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB262_1; +; SM60-NEXT: $L__BB262_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB263_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB263_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB263_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB263_1; +; SM60-NEXT: $L__BB263_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic + ret i16 %new +} + +define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB264_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB264_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB264_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB264_1; +; SM60-NEXT: $L__BB264_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB265_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB265_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB265_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB265_1; +; SM60-NEXT: $L__BB265_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB266_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB266_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB266_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB266_1; +; SM60-NEXT: $L__BB266_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB267_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB267_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB267_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB267_1; +; SM60-NEXT: $L__BB267_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB268_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB268_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB268_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB268_1; +; SM60-NEXT: $L__BB268_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB269_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB269_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB269_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB269_1; +; SM60-NEXT: $L__BB269_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB270_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB270_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB270_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB270_1; +; SM60-NEXT: $L__BB270_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB271_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB271_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB271_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB271_1; +; SM60-NEXT: $L__BB271_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB272_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB272_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB272_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB272_1; +; SM60-NEXT: $L__BB272_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB273_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB273_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB273_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB273_1; +; SM60-NEXT: $L__BB273_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB274_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB274_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB274_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB274_1; +; SM60-NEXT: $L__BB274_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB275_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB275_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB275_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB275_1; +; SM60-NEXT: $L__BB275_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release acquire + ret i16 %new +} + +define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB276_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB276_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB276_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB276_1; +; SM60-NEXT: $L__BB276_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB277_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB277_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB277_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB277_1; +; SM60-NEXT: $L__BB277_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB278_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB278_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB278_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB278_1; +; SM60-NEXT: $L__BB278_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB279_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB279_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB279_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB279_1; +; SM60-NEXT: $L__BB279_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB280_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB280_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB280_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB280_1; +; SM60-NEXT: $L__BB280_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB281_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB281_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB281_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB281_1; +; SM60-NEXT: $L__BB281_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB282_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB282_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB282_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB282_1; +; SM60-NEXT: $L__BB282_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB283_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB283_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB283_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB283_1; +; SM60-NEXT: $L__BB283_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB284_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB284_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB284_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB284_1; +; SM60-NEXT: $L__BB284_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB285_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB285_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB285_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB285_1; +; SM60-NEXT: $L__BB285_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB286_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB286_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB286_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB286_1; +; SM60-NEXT: $L__BB286_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB287_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB287_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB287_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB287_1; +; SM60-NEXT: $L__BB287_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB288_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB288_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB288_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB288_1; +; SM60-NEXT: $L__BB288_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB289_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB289_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB289_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB289_1; +; SM60-NEXT: $L__BB289_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB290_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB290_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB290_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB290_1; +; SM60-NEXT: $L__BB290_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB291_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB291_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB291_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB291_1; +; SM60-NEXT: $L__BB291_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB292_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB292_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB292_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB292_1; +; SM60-NEXT: $L__BB292_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB293_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB293_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB293_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB293_1; +; SM60-NEXT: $L__BB293_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB294_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB294_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB294_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB294_1; +; SM60-NEXT: $L__BB294_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB295_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB295_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB295_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB295_1; +; SM60-NEXT: $L__BB295_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB296_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB296_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB296_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB296_1; +; SM60-NEXT: $L__BB296_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB297_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB297_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB297_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB297_1; +; SM60-NEXT: $L__BB297_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB298_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB298_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB298_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB298_1; +; SM60-NEXT: $L__BB298_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB299_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB299_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB299_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB299_1; +; SM60-NEXT: $L__BB299_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB300_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB300_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB300_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB300_1; +; SM60-NEXT: $L__BB300_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB301_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB301_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB301_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB301_1; +; SM60-NEXT: $L__BB301_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB302_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB302_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB302_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB302_1; +; SM60-NEXT: $L__BB302_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB303_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB303_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB303_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB303_1; +; SM60-NEXT: $L__BB303_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB304_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB304_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB304_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB304_1; +; SM60-NEXT: $L__BB304_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB305_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB305_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB305_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB305_1; +; SM60-NEXT: $L__BB305_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB306_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB306_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB306_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB306_1; +; SM60-NEXT: $L__BB306_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB307_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB307_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB307_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB307_1; +; SM60-NEXT: $L__BB307_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB308_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB308_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB308_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB308_1; +; SM60-NEXT: $L__BB308_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB309_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB309_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB309_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB309_1; +; SM60-NEXT: $L__BB309_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB310_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB310_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB310_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB310_1; +; SM60-NEXT: $L__BB310_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB311_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB311_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB311_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB311_1; +; SM60-NEXT: $L__BB311_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB312_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB312_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB312_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB312_1; +; SM60-NEXT: $L__BB312_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB313_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB313_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB313_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB313_1; +; SM60-NEXT: $L__BB313_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB314_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB314_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB314_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB314_1; +; SM60-NEXT: $L__BB314_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB315_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB315_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB315_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB315_1; +; SM60-NEXT: $L__BB315_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB316_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB316_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB316_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB316_1; +; SM60-NEXT: $L__BB316_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB317_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB317_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB317_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB317_1; +; SM60-NEXT: $L__BB317_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB318_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB318_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB318_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB318_1; +; SM60-NEXT: $L__BB318_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB319_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB319_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB319_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB319_1; +; SM60-NEXT: $L__BB319_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB320_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB320_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB320_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB320_1; +; SM60-NEXT: $L__BB320_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB321_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB321_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB321_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB321_1; +; SM60-NEXT: $L__BB321_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB322_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB322_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB322_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB322_1; +; SM60-NEXT: $L__BB322_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB323_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB323_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB323_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB323_1; +; SM60-NEXT: $L__BB323_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB324_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB324_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB324_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB324_1; +; SM60-NEXT: $L__BB324_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB325_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB325_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB325_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB325_1; +; SM60-NEXT: $L__BB325_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB326_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB326_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB326_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB326_1; +; SM60-NEXT: $L__BB326_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB327_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB327_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB327_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB327_1; +; SM60-NEXT: $L__BB327_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB328_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB328_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB328_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB328_1; +; SM60-NEXT: $L__BB328_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB329_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB329_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB329_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB329_1; +; SM60-NEXT: $L__BB329_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB330_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB330_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB330_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB330_1; +; SM60-NEXT: $L__BB330_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB331_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB331_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB331_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB331_1; +; SM60-NEXT: $L__BB331_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB332_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB332_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB332_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB332_1; +; SM60-NEXT: $L__BB332_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB333_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB333_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB333_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB333_1; +; SM60-NEXT: $L__BB333_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB334_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB334_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB334_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB334_1; +; SM60-NEXT: $L__BB334_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB335_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB335_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB335_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB335_1; +; SM60-NEXT: $L__BB335_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB336_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB336_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB336_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB336_1; +; SM60-NEXT: $L__BB336_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB337_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB337_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB337_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB337_1; +; SM60-NEXT: $L__BB337_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB338_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB338_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB338_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB338_1; +; SM60-NEXT: $L__BB338_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB339_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB339_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB339_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB339_1; +; SM60-NEXT: $L__BB339_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB340_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB340_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB340_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB340_1; +; SM60-NEXT: $L__BB340_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB341_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB341_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB341_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB341_1; +; SM60-NEXT: $L__BB341_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB342_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB342_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB342_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB342_1; +; SM60-NEXT: $L__BB342_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB343_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB343_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB343_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB343_1; +; SM60-NEXT: $L__BB343_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB344_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB344_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB344_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB344_1; +; SM60-NEXT: $L__BB344_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB345_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB345_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB345_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB345_1; +; SM60-NEXT: $L__BB345_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB346_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB346_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB346_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB346_1; +; SM60-NEXT: $L__BB346_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB347_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB347_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB347_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB347_1; +; SM60-NEXT: $L__BB347_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_seq_cst_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB348_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB348_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB348_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB348_1; +; SM60-NEXT: $L__BB348_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_seq_cst_i16_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB349_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB349_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB349_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB349_1; +; SM60-NEXT: $L__BB349_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_seq_cst_i16_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB350_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB350_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB350_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB350_1; +; SM60-NEXT: $L__BB350_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_seq_cst_i16_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB351_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB351_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB351_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB351_1; +; SM60-NEXT: $L__BB351_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_seq_cst_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB352_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB352_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB352_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB352_1; +; SM60-NEXT: $L__BB352_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_seq_cst_i16_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB353_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB353_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB353_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB353_1; +; SM60-NEXT: $L__BB353_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_seq_cst_i16_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB354_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB354_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB354_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB354_1; +; SM60-NEXT: $L__BB354_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_seq_cst_i16_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB355_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB355_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB355_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB355_1; +; SM60-NEXT: $L__BB355_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_seq_cst_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB356_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB356_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB356_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB356_1; +; SM60-NEXT: $L__BB356_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_seq_cst_i16_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB357_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB357_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB357_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB357_1; +; SM60-NEXT: $L__BB357_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_seq_cst_i16_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB358_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB358_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB358_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB358_1; +; SM60-NEXT: $L__BB358_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_seq_cst_i16_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB359_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB359_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB359_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB359_1; +; SM60-NEXT: $L__BB359_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst + ret i16 %new +} + +define i32 @monotonic_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_monotonic_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_monotonic_i32_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_monotonic_i32_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_monotonic_i32_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_monotonic_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_monotonic_i32_global_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_monotonic_i32_global_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_monotonic_i32_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_monotonic_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_monotonic_i32_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_monotonic_i32_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_monotonic_i32_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_acquire_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_acquire_i32_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_acquire_i32_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_acquire_i32_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_acquire_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_acquire_i32_global_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_acquire_i32_global_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_acquire_i32_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_acquire_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_acquire_i32_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_acquire_i32_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_acquire_i32_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_seq_cst_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_seq_cst_i32_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_seq_cst_i32_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_seq_cst_i32_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_seq_cst_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_seq_cst_i32_global_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_seq_cst_i32_global_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_seq_cst_i32_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_seq_cst_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_seq_cst_i32_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_seq_cst_i32_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_seq_cst_i32_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst + ret i32 %new +} + +define i32 @acquire_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_monotonic_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_monotonic_i32_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_monotonic_i32_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_monotonic_i32_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_monotonic_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_monotonic_i32_global_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_monotonic_i32_global_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_monotonic_i32_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_monotonic_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_monotonic_i32_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_monotonic_i32_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_monotonic_i32_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic + ret i32 %new +} + +define i32 @acquire_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_acquire_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_acquire_i32_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_acquire_i32_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_acquire_i32_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_acquire_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_acquire_i32_global_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_acquire_i32_global_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_acquire_i32_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_acquire_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_acquire_i32_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_acquire_i32_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_acquire_i32_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_seq_cst_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_seq_cst_i32_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_seq_cst_i32_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_seq_cst_i32_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_seq_cst_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_seq_cst_i32_global_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_seq_cst_i32_global_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_seq_cst_i32_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_seq_cst_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_seq_cst_i32_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_seq_cst_i32_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_seq_cst_i32_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst + ret i32 %new +} + +define i32 @release_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_monotonic_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_monotonic_i32_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_monotonic_i32_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_monotonic_i32_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_monotonic_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_monotonic_i32_global_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_monotonic_i32_global_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_monotonic_i32_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_monotonic_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_monotonic_i32_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_monotonic_i32_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_monotonic_i32_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release monotonic + ret i32 %new +} + +define i32 @release_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_acquire_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_acquire_i32_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_acquire_i32_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_acquire_i32_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_acquire_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_acquire_i32_global_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_acquire_i32_global_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_acquire_i32_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_acquire_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_acquire_i32_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_acquire_i32_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_acquire_i32_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release acquire + ret i32 %new +} + +define i32 @release_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_seq_cst_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_seq_cst_i32_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_seq_cst_i32_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_seq_cst_i32_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_seq_cst_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_seq_cst_i32_global_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_seq_cst_i32_global_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_seq_cst_i32_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_seq_cst_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_seq_cst_i32_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_seq_cst_i32_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_seq_cst_i32_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_monotonic_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_monotonic_i32_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_monotonic_i32_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_monotonic_i32_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_monotonic_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_monotonic_i32_global_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_monotonic_i32_global_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_monotonic_i32_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_monotonic_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_monotonic_i32_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_monotonic_i32_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_monotonic_i32_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_global_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_global_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_seq_cst_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_seq_cst_i32_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_seq_cst_i32_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_seq_cst_i32_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_seq_cst_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_seq_cst_i32_global_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_seq_cst_i32_global_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_seq_cst_i32_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_seq_cst_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_seq_cst_i32_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_seq_cst_i32_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_seq_cst_i32_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_monotonic_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_monotonic_i32_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_monotonic_i32_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_monotonic_i32_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_monotonic_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_monotonic_i32_global_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_monotonic_i32_global_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_monotonic_i32_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_monotonic_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_monotonic_i32_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_monotonic_i32_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_monotonic_i32_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_acquire_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_acquire_i32_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_acquire_i32_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_acquire_i32_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_acquire_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_acquire_i32_global_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_acquire_i32_global_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_acquire_i32_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_acquire_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_acquire_i32_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_acquire_i32_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_acquire_i32_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_seq_cst_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_seq_cst_i32_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_seq_cst_i32_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_seq_cst_i32_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_seq_cst_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_seq_cst_i32_global_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_seq_cst_i32_global_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_seq_cst_i32_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_seq_cst_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_seq_cst_i32_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_seq_cst_i32_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_seq_cst_i32_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst + ret i32 %new +} + +define i64 @monotonic_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_monotonic_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_monotonic_i64_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_monotonic_i64_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_monotonic_i64_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_monotonic_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_monotonic_i64_global_sys( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_monotonic_i64_global_cta( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_monotonic_i64_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_monotonic_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_monotonic_i64_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_monotonic_i64_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_monotonic_i64_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_acquire_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_acquire_i64_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_acquire_i64_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_acquire_i64_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_acquire_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_acquire_i64_global_sys( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_acquire_i64_global_cta( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_acquire_i64_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_acquire_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_acquire_i64_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_acquire_i64_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_acquire_i64_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_seq_cst_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_seq_cst_i64_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_seq_cst_i64_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_seq_cst_i64_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_seq_cst_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_seq_cst_i64_global_sys( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_seq_cst_i64_global_cta( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_seq_cst_i64_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_seq_cst_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_seq_cst_i64_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_seq_cst_i64_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_seq_cst_i64_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst + ret i64 %new +} + +define i64 @acquire_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_monotonic_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_monotonic_i64_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_monotonic_i64_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_monotonic_i64_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_monotonic_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_monotonic_i64_global_sys( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_monotonic_i64_global_cta( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_monotonic_i64_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_monotonic_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_monotonic_i64_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_monotonic_i64_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_monotonic_i64_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic + ret i64 %new +} + +define i64 @acquire_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_acquire_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_acquire_i64_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_acquire_i64_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_acquire_i64_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_acquire_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_acquire_i64_global_sys( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_acquire_i64_global_cta( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_acquire_i64_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_acquire_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_acquire_i64_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_acquire_i64_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_acquire_i64_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_seq_cst_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_seq_cst_i64_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_seq_cst_i64_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_seq_cst_i64_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_seq_cst_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_seq_cst_i64_global_sys( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_seq_cst_i64_global_cta( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_seq_cst_i64_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_seq_cst_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_seq_cst_i64_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_seq_cst_i64_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_seq_cst_i64_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst + ret i64 %new +} + +define i64 @release_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_monotonic_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_monotonic_i64_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_monotonic_i64_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_monotonic_i64_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_monotonic_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_monotonic_i64_global_sys( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_monotonic_i64_global_cta( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_monotonic_i64_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_monotonic_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_monotonic_i64_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_monotonic_i64_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_monotonic_i64_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release monotonic + ret i64 %new } -define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_seq_cst_i16_shared( +define i64 @release_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_acquire_i64_generic( ; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB89_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB89_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB89_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB89_1; -; SM60-NEXT: $L__BB89_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst seq_cst - ret i16 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release acquire + ret i64 %new } -define i32 @monotonic_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_monotonic_i32_generic( +define i64 @release_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_acquire_i64_generic_sys( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release acquire + ret i64 %new } -define i32 @monotonic_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_monotonic_i32_global( +define i64 @release_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_acquire_i64_generic_cta( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release acquire + ret i64 %new } -define i32 @monotonic_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_monotonic_i32_shared( +define i64 @release_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_acquire_i64_generic_gpu( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") release acquire + ret i64 %new } -define i32 @monotonic_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_acquire_i32_generic( +define i64 @release_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_acquire_i64_global( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release acquire + ret i64 %new } -define i32 @monotonic_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_acquire_i32_global( +define i64 @release_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_acquire_i64_global_sys( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release acquire + ret i64 %new } -define i32 @monotonic_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_acquire_i32_shared( +define i64 @release_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_acquire_i64_global_cta( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release acquire + ret i64 %new } -define i32 @monotonic_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_seq_cst_i32_generic( +define i64 @release_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_acquire_i64_global_gpu( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release acquire + ret i64 %new } -define i32 @monotonic_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_seq_cst_i32_global( +define i64 @release_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_acquire_i64_shared( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release acquire + ret i64 %new +} + +define i64 @release_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_acquire_i64_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release acquire + ret i64 %new +} + +define i64 @release_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_acquire_i64_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release acquire + ret i64 %new +} + +define i64 @release_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_acquire_i64_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release acquire + ret i64 %new +} + +define i64 @release_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_seq_cst_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic seq_cst - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release seq_cst + ret i64 %new } -define i32 @monotonic_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_seq_cst_i32_shared( +define i64 @release_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_seq_cst_i64_generic_sys( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release seq_cst + ret i64 %new +} + +define i64 @release_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_seq_cst_i64_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic seq_cst - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst + ret i64 %new } -define i32 @acquire_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_monotonic_i32_generic( +define i64 @release_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_seq_cst_i64_generic_gpu( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") release seq_cst + ret i64 %new } -define i32 @acquire_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_monotonic_i32_global( +define i64 @release_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_seq_cst_i64_global( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire monotonic - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release seq_cst + ret i64 %new } -define i32 @acquire_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_monotonic_i32_shared( +define i64 @release_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_seq_cst_i64_global_sys( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire monotonic - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release seq_cst + ret i64 %new } -define i32 @acquire_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_acquire_i32_generic( +define i64 @release_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_seq_cst_i64_global_cta( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst + ret i64 %new } -define i32 @acquire_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_acquire_i32_global( +define i64 @release_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_seq_cst_i64_global_gpu( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release seq_cst + ret i64 %new } -define i32 @acquire_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_acquire_i32_shared( +define i64 @release_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_seq_cst_i64_shared( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release seq_cst + ret i64 %new } -define i32 @acquire_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_seq_cst_i32_generic( +define i64 @release_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_seq_cst_i64_shared_sys( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release seq_cst + ret i64 %new } -define i32 @acquire_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_seq_cst_i32_global( +define i64 @release_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_seq_cst_i64_shared_cta( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst + ret i64 %new } -define i32 @acquire_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_seq_cst_i32_shared( +define i64 @release_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_seq_cst_i64_shared_gpu( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release seq_cst + ret i64 %new } -define i32 @release_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_monotonic_i32_generic( +define i64 @acq_rel_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_monotonic_i64_generic( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel monotonic + ret i64 %new } -define i32 @release_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_monotonic_i32_global( +define i64 @acq_rel_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_monotonic_i64_generic_sys( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic + ret i64 %new } -define i32 @release_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_monotonic_i32_shared( +define i64 @acq_rel_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_monotonic_i64_generic_cta( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic + ret i64 %new } -define i32 @release_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_acquire_i32_generic( +define i64 @acq_rel_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_monotonic_i64_generic_gpu( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release acquire - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic + ret i64 %new } -define i32 @release_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_acquire_i32_global( +define i64 @acq_rel_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_monotonic_i64_global( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel monotonic + ret i64 %new } -define i32 @release_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_acquire_i32_shared( +define i64 @acq_rel_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_monotonic_i64_global_sys( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic + ret i64 %new } -define i32 @release_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_seq_cst_i32_generic( +define i64 @acq_rel_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_monotonic_i64_global_cta( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic + ret i64 %new } -define i32 @release_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_seq_cst_i32_global( +define i64 @acq_rel_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_monotonic_i64_global_gpu( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic + ret i64 %new } -define i32 @release_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_seq_cst_i32_shared( +define i64 @acq_rel_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_monotonic_i64_shared( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel monotonic + ret i64 %new } -define i32 @acq_rel_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_monotonic_i32_generic( +define i64 @acq_rel_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_monotonic_i64_shared_sys( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel monotonic - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic + ret i64 %new } -define i32 @acq_rel_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_monotonic_i32_global( +define i64 @acq_rel_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_monotonic_i64_shared_cta( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel monotonic - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic + ret i64 %new } -define i32 @acq_rel_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_monotonic_i32_shared( +define i64 @acq_rel_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_monotonic_i64_shared_gpu( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel monotonic - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic + ret i64 %new } -define i32 @acq_rel_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_acquire_i32_generic( +define i64 @acq_rel_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_acquire_i64_generic( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel acquire - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel acquire + ret i64 %new } -define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_acquire_i32_global( +define i64 @acq_rel_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_acquire_i64_generic_sys( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire + ret i64 %new } -define i32 @acq_rel_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_acquire_i32_shared( +define i64 @acq_rel_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_acquire_i64_generic_cta( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel acquire - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire + ret i64 %new } -define i32 @acq_rel_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_seq_cst_i32_generic( +define i64 @acq_rel_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_acquire_i64_generic_gpu( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel seq_cst - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire + ret i64 %new } -define i32 @acq_rel_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_seq_cst_i32_global( +define i64 @acq_rel_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_acquire_i64_global( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel acquire + ret i64 %new } -define i32 @acq_rel_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_seq_cst_i32_shared( +define i64 @acq_rel_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_acquire_i64_global_sys( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire + ret i64 %new } -define i32 @seq_cst_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_monotonic_i32_generic( +define i64 @acq_rel_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_acquire_i64_global_cta( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst monotonic - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire + ret i64 %new } -define i32 @seq_cst_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_monotonic_i32_global( +define i64 @acq_rel_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_acquire_i64_global_gpu( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst monotonic - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire + ret i64 %new } -define i32 @seq_cst_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_monotonic_i32_shared( +define i64 @acq_rel_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_acquire_i64_shared( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst monotonic - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel acquire + ret i64 %new } -define i32 @seq_cst_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_acquire_i32_generic( +define i64 @acq_rel_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_acquire_i64_shared_sys( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire + ret i64 %new } -define i32 @seq_cst_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_acquire_i32_global( +define i64 @acq_rel_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_acquire_i64_shared_cta( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire + ret i64 %new } -define i32 @seq_cst_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_acquire_i32_shared( +define i64 @acq_rel_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_acquire_i64_shared_gpu( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire + ret i64 %new } -define i32 @seq_cst_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_seq_cst_i32_generic( +define i64 @acq_rel_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_seq_cst_i64_generic( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel seq_cst + ret i64 %new } -define i32 @seq_cst_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_seq_cst_i32_global( +define i64 @acq_rel_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_seq_cst_i64_generic_sys( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst seq_cst - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst + ret i64 %new } -define i32 @seq_cst_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_seq_cst_i32_shared( +define i64 @acq_rel_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_seq_cst_i64_generic_cta( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst seq_cst - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst + ret i64 %new } -define i64 @monotonic_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_monotonic_i64_generic( +define i64 @acq_rel_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_seq_cst_i64_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_gpu_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst ret i64 %new } -define i64 @monotonic_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_monotonic_i64_global( +define i64 @acq_rel_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_seq_cst_i64_global( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel seq_cst ret i64 %new } -define i64 @monotonic_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_monotonic_i64_shared( +define i64 @acq_rel_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_seq_cst_i64_global_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst ret i64 %new } -define i64 @monotonic_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_acquire_i64_generic( +define i64 @acq_rel_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_seq_cst_i64_global_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst ret i64 %new } -define i64 @monotonic_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_acquire_i64_global( +define i64 @acq_rel_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_seq_cst_i64_global_gpu( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_gpu_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst ret i64 %new } -define i64 @monotonic_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_acquire_i64_shared( +define i64 @acq_rel_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_seq_cst_i64_shared( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel seq_cst ret i64 %new } -define i64 @monotonic_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_seq_cst_i64_generic( +define i64 @acq_rel_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_seq_cst_i64_shared_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst ret i64 %new } -define i64 @monotonic_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_seq_cst_i64_global( +define i64 @acq_rel_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_seq_cst_i64_shared_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst ret i64 %new } -define i64 @monotonic_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_seq_cst_i64_shared( +define i64 @acq_rel_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_seq_cst_i64_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_gpu_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst ret i64 %new } -define i64 @acquire_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_monotonic_i64_generic( +define i64 @seq_cst_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_monotonic_i64_generic( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst monotonic ret i64 %new } -define i64 @acquire_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_monotonic_i64_global( +define i64 @seq_cst_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_monotonic_i64_generic_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic ret i64 %new } -define i64 @acquire_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_monotonic_i64_shared( +define i64 @seq_cst_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_monotonic_i64_generic_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic ret i64 %new } -define i64 @acquire_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_acquire_i64_generic( +define i64 @seq_cst_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_monotonic_i64_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_gpu_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire acquire + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic ret i64 %new } -define i64 @acquire_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_acquire_i64_global( +define i64 @seq_cst_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_monotonic_i64_global( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst monotonic ret i64 %new } -define i64 @acquire_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_acquire_i64_shared( +define i64 @seq_cst_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_monotonic_i64_global_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic ret i64 %new } -define i64 @acquire_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_seq_cst_i64_generic( +define i64 @seq_cst_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_monotonic_i64_global_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic ret i64 %new } - -define i64 @acquire_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_seq_cst_i64_global( + +define i64 @seq_cst_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_monotonic_i64_global_gpu( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_gpu_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic ret i64 %new } -define i64 @acquire_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_seq_cst_i64_shared( +define i64 @seq_cst_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_monotonic_i64_shared( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst monotonic ret i64 %new } -define i64 @release_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_monotonic_i64_generic( +define i64 @seq_cst_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_monotonic_i64_shared_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic ret i64 %new } -define i64 @release_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_monotonic_i64_global( +define i64 @seq_cst_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_monotonic_i64_shared_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic ret i64 %new } -define i64 @release_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_monotonic_i64_shared( +define i64 @seq_cst_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_monotonic_i64_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_gpu_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic ret i64 %new } -define i64 @release_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_acquire_i64_generic( +define i64 @seq_cst_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_acquire_i64_generic( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release acquire + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst acquire ret i64 %new } -define i64 @release_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_acquire_i64_global( +define i64 @seq_cst_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_acquire_i64_generic_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release acquire + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire ret i64 %new } -define i64 @release_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_acquire_i64_shared( +define i64 @seq_cst_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_acquire_i64_generic_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release acquire + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire ret i64 %new } -define i64 @release_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_seq_cst_i64_generic( +define i64 @seq_cst_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_acquire_i64_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_gpu_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release seq_cst + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire ret i64 %new } -define i64 @release_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_seq_cst_i64_global( +define i64 @seq_cst_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_acquire_i64_global( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst acquire ret i64 %new } -define i64 @release_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_seq_cst_i64_shared( +define i64 @seq_cst_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_acquire_i64_global_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire ret i64 %new } -define i64 @acq_rel_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_monotonic_i64_generic( +define i64 @seq_cst_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_acquire_i64_global_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire ret i64 %new } -define i64 @acq_rel_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_monotonic_i64_global( +define i64 @seq_cst_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_acquire_i64_global_gpu( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_gpu_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire ret i64 %new } -define i64 @acq_rel_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_monotonic_i64_shared( +define i64 @seq_cst_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_acquire_i64_shared( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst acquire ret i64 %new } -define i64 @acq_rel_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_acquire_i64_generic( +define i64 @seq_cst_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_acquire_i64_shared_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire ret i64 %new } -define i64 @acq_rel_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_acquire_i64_global( +define i64 @seq_cst_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_acquire_i64_shared_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire ret i64 %new } -define i64 @acq_rel_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_acquire_i64_shared( +define i64 @seq_cst_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_acquire_i64_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_gpu_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire ret i64 %new } -define i64 @acq_rel_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_seq_cst_i64_generic( +define i64 @seq_cst_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_seq_cst_i64_generic( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel seq_cst + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst ret i64 %new } -define i64 @acq_rel_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_seq_cst_i64_global( +define i64 @seq_cst_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_seq_cst_i64_generic_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel seq_cst + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst ret i64 %new } -define i64 @acq_rel_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_seq_cst_i64_shared( +define i64 @seq_cst_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_seq_cst_i64_generic_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel seq_cst + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_monotonic_i64_generic( +define i64 @seq_cst_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_seq_cst_i64_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_gpu_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_monotonic_i64_global( +define i64 @seq_cst_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_seq_cst_i64_global( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_monotonic_i64_shared( +define i64 @seq_cst_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_seq_cst_i64_global_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_acquire_i64_generic( +define i64 @seq_cst_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_seq_cst_i64_global_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_acquire_i64_global( +define i64 @seq_cst_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_seq_cst_i64_global_gpu( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_gpu_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_acquire_i64_shared( +define i64 @seq_cst_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_seq_cst_i64_shared( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_seq_cst_i64_generic( +define i64 @seq_cst_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_seq_cst_i64_shared_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_seq_cst_i64_global( +define i64 @seq_cst_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_seq_cst_i64_shared_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_seq_cst_i64_shared( +define i64 @seq_cst_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_seq_cst_i64_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_gpu_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst ret i64 %new } diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll index 7107fbcf6eb54..2d70eb80cafdf 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll @@ -31,7 +31,7 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB0_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -47,8 +47,8 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ret i8 %new } -define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_monotonic_i8_global( +define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -56,8 +56,8 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -68,15 +68,15 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB1_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB1_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -88,12 +88,12 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: $L__BB1_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_monotonic_i8_shared( +define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -101,8 +101,8 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -113,15 +113,15 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB2_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -133,12 +133,12 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: $L__BB2_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new } -define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_acquire_i8_generic( +define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -146,8 +146,8 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -158,7 +158,7 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -166,7 +166,7 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB3_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -176,15 +176,14 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB3_1; ; SM70-NEXT: $L__BB3_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new } -define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_acquire_i8_global( +define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_global( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -192,8 +191,8 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -204,7 +203,7 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -212,7 +211,7 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB4_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -222,15 +221,14 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB4_1; ; SM70-NEXT: $L__BB4_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic monotonic ret i8 %new } -define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_acquire_i8_shared( +define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -238,8 +236,8 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -250,15 +248,15 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB5_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB5_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -268,15 +266,14 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB5_1; ; SM70-NEXT: $L__BB5_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new } -define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_seq_cst_i8_generic( +define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -284,9 +281,8 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -297,15 +293,15 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB6_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB6_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -315,15 +311,14 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB6_1; ; SM70-NEXT: $L__BB6_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new } -define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_seq_cst_i8_global( +define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -331,9 +326,8 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -344,7 +338,7 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -352,7 +346,7 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB7_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -362,15 +356,14 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB7_1; ; SM70-NEXT: $L__BB7_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new } -define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_seq_cst_i8_shared( +define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_shared( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -378,9 +371,8 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -391,7 +383,7 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -399,7 +391,7 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB8_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -409,15 +401,14 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB8_1; ; SM70-NEXT: $L__BB8_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic monotonic ret i8 %new } -define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_monotonic_i8_generic( +define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -425,8 +416,8 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -437,15 +428,15 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB9_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -455,15 +446,14 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB9_1; ; SM70-NEXT: $L__BB9_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new } -define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_monotonic_i8_global( +define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -471,8 +461,8 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -483,15 +473,15 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB10_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB10_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -501,15 +491,14 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB10_1; ; SM70-NEXT: $L__BB10_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new } -define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_monotonic_i8_shared( +define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -517,8 +506,8 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -529,7 +518,7 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -537,7 +526,7 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB11_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -547,15 +536,14 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB11_1; ; SM70-NEXT: $L__BB11_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new } -define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_acquire_i8_generic( +define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_generic( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -563,8 +551,8 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -575,7 +563,7 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -583,7 +571,7 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB12_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -596,12 +584,12 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic acquire ret i8 %new } -define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_acquire_i8_global( +define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -609,8 +597,8 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -621,15 +609,15 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB13_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB13_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -642,12 +630,12 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire ret i8 %new } -define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_acquire_i8_shared( +define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -655,8 +643,8 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -667,15 +655,15 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB14_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB14_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -685,15 +673,15 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB14_1; ; SM70-NEXT: $L__BB14_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_seq_cst_i8_generic( +define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -701,9 +689,8 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -714,7 +701,7 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -722,7 +709,7 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB15_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -732,15 +719,15 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB15_1; ; SM70-NEXT: $L__BB15_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_seq_cst_i8_global( +define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_global( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -748,9 +735,8 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -761,7 +747,7 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -769,7 +755,7 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB16_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -782,12 +768,12 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_seq_cst_i8_shared( +define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -795,9 +781,8 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -808,15 +793,15 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB17_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB17_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -829,12 +814,12 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire ret i8 %new } -define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_monotonic_i8_generic( +define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -842,9 +827,8 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_param_0]; -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -855,15 +839,15 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB18_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB18_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -873,14 +857,15 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB18_1; ; SM70-NEXT: $L__BB18_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_monotonic_i8_global( +define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -888,9 +873,8 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_param_0]; -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -901,7 +885,7 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -909,7 +893,7 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB19_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -919,14 +903,15 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB19_1; ; SM70-NEXT: $L__BB19_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire ret i8 %new } -define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_monotonic_i8_shared( +define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_shared( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -934,9 +919,8 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_param_0]; -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -947,7 +931,7 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -955,7 +939,7 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB20_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -965,14 +949,15 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB20_1; ; SM70-NEXT: $L__BB20_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic acquire ret i8 %new } -define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_acquire_i8_generic( +define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -980,9 +965,8 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_param_0]; -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -993,15 +977,15 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB21_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB21_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1014,12 +998,12 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire ret i8 %new } -define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_acquire_i8_global( +define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1027,9 +1011,8 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_param_0]; -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1040,15 +1023,15 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB22_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB22_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1058,15 +1041,15 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB22_1; ; SM70-NEXT: $L__BB22_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_acquire_i8_shared( +define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1074,9 +1057,8 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_param_0]; -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1087,7 +1069,7 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -1095,7 +1077,7 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB23_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1105,15 +1087,15 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB23_1; ; SM70-NEXT: $L__BB23_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire ret i8 %new } -define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_seq_cst_i8_generic( +define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_generic( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1121,8 +1103,8 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_param_0]; ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1134,7 +1116,7 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -1142,7 +1124,7 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB24_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1155,12 +1137,12 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic seq_cst ret i8 %new } -define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_seq_cst_i8_global( +define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1168,8 +1150,8 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1181,15 +1163,15 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB25_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB25_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1202,12 +1184,12 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new } -define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_seq_cst_i8_shared( +define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1215,9 +1197,9 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1228,15 +1210,15 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB26_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB26_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1246,15 +1228,15 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB26_1; ; SM70-NEXT: $L__BB26_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new } -define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_monotonic_i8_generic( +define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1262,9 +1244,9 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_param_0]; -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1275,7 +1257,7 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -1283,7 +1265,7 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB27_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1293,15 +1275,15 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB27_1; ; SM70-NEXT: $L__BB27_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst ret i8 %new } -define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_monotonic_i8_global( +define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_global( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1309,9 +1291,9 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_param_0]; -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_param_0]; +; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1322,7 +1304,7 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -1330,7 +1312,7 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB28_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1343,12 +1325,12 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic seq_cst ret i8 %new } -define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_monotonic_i8_shared( +define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1356,9 +1338,9 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_param_0]; -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1369,15 +1351,15 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB29_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB29_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1390,12 +1372,12 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new } -define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_acquire_i8_generic( +define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1403,9 +1385,9 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_param_0]; -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1416,15 +1398,15 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB30_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB30_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1434,15 +1416,15 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB30_1; ; SM70-NEXT: $L__BB30_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new } -define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_acquire_i8_global( +define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1450,9 +1432,9 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0]; -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1463,7 +1445,7 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -1471,7 +1453,7 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB31_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1481,15 +1463,15 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB31_1; ; SM70-NEXT: $L__BB31_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst ret i8 %new } -define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_acquire_i8_shared( +define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_shared( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1497,9 +1479,9 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_param_0]; -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_param_0]; +; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1510,7 +1492,7 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -1518,7 +1500,7 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB32_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1531,12 +1513,12 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_seq_cst_i8_generic( +define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1544,8 +1526,8 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1557,15 +1539,15 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB33_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB33_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1578,12 +1560,12 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_seq_cst_i8_global( +define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1591,9 +1573,9 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1604,15 +1586,15 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB34_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB34_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1622,15 +1604,15 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB34_1; ; SM70-NEXT: $L__BB34_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_seq_cst_i8_shared( +define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1638,9 +1620,9 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1651,7 +1633,7 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -1659,7 +1641,7 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB35_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1669,15 +1651,15 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB35_1; ; SM70-NEXT: $L__BB35_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst ret i8 %new } -define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_monotonic_i8_generic( +define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_generic( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1685,9 +1667,8 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1698,7 +1679,7 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -1706,7 +1687,7 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB36_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1719,12 +1700,12 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire monotonic ret i8 %new } -define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_monotonic_i8_global( +define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1732,9 +1713,8 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1745,15 +1725,15 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB37_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB37_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1766,12 +1746,12 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic ret i8 %new } -define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_monotonic_i8_shared( +define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1779,9 +1759,8 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1792,15 +1771,15 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB38_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB38_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1810,15 +1789,15 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB38_1; ; SM70-NEXT: $L__BB38_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new } -define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_acquire_i8_generic( +define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1826,9 +1805,8 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1839,7 +1817,7 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -1847,7 +1825,7 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB39_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1857,15 +1835,15 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB39_1; ; SM70-NEXT: $L__BB39_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic ret i8 %new } -define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_acquire_i8_global( +define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_global( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1873,9 +1851,8 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1886,7 +1863,7 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -1894,7 +1871,7 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB40_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1907,12 +1884,12 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire monotonic ret i8 %new } -define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_acquire_i8_shared( +define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1920,9 +1897,8 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1933,15 +1909,15 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB41_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB41_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1954,12 +1930,12 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic ret i8 %new } -define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_seq_cst_i8_generic( +define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1967,9 +1943,8 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1980,15 +1955,15 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB42_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB42_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1998,15 +1973,15 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB42_1; ; SM70-NEXT: $L__BB42_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new } -define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_seq_cst_i8_global( +define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -2014,9 +1989,8 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -2027,7 +2001,7 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -2035,7 +2009,7 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB43_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -2045,15 +2019,15 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB43_1; ; SM70-NEXT: $L__BB43_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic ret i8 %new } -define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_seq_cst_i8_shared( +define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_shared( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -2061,9 +2035,8 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -2074,7 +2047,7 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -2082,7 +2055,7 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB44_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -2095,3586 +2068,20641 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire monotonic ret i8 %new } -define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_monotonic_i16_generic( +define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB45_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB45_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB45_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB45_1; ; SM70-NEXT: $L__BB45_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic + ret i8 %new } -define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_monotonic_i16_global( +define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB46_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB46_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB46_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB46_1; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB46_1; ; SM70-NEXT: $L__BB46_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic monotonic - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic + ret i8 %new } -define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_monotonic_i16_shared( +define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB47_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB47_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB47_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB47_1; ; SM70-NEXT: $L__BB47_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic monotonic - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic + ret i8 %new } -define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_acquire_i16_generic( +define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_generic( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB48_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB48_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB48_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB48_1; ; SM70-NEXT: $L__BB48_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic acquire - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire + ret i8 %new } -define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_acquire_i16_global( +define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB49_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB49_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB49_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB49_1; ; SM70-NEXT: $L__BB49_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic acquire - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire acquire + ret i8 %new } -define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_acquire_i16_shared( +define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB50_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB50_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB50_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB50_1; ; SM70-NEXT: $L__BB50_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic acquire - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire + ret i8 %new } -define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_seq_cst_i16_generic( +define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB51_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB51_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB51_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB51_1; ; SM70-NEXT: $L__BB51_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic seq_cst - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire + ret i8 %new } -define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_seq_cst_i16_global( +define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_global( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB52_1: // %partword.cmpxchg.loop +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB52_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB52_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB52_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB52_1; ; SM70-NEXT: $L__BB52_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire acquire + ret i8 %new } -define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_seq_cst_i16_shared( +define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB53_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB53_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB53_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB53_1; ; SM70-NEXT: $L__BB53_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire + ret i8 %new } -define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_monotonic_i16_generic( +define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB54_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB54_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB54_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB54_1; ; SM70-NEXT: $L__BB54_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire monotonic - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire + ret i8 %new } -define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_monotonic_i16_global( +define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB55_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB55_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB55_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB55_1; ; SM70-NEXT: $L__BB55_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire monotonic - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire + ret i8 %new } -define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_monotonic_i16_shared( +define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_shared( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB56_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB56_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB56_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB56_1; ; SM70-NEXT: $L__BB56_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire monotonic - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire acquire + ret i8 %new } -define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_acquire_i16_generic( +define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB57_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB57_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB57_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB57_1; ; SM70-NEXT: $L__BB57_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire + ret i8 %new } -define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_acquire_i16_global( +define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB58_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB58_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB58_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB58_1; ; SM70-NEXT: $L__BB58_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire + ret i8 %new } -define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_acquire_i16_shared( +define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB59_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB59_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB59_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB59_1; ; SM70-NEXT: $L__BB59_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire + ret i8 %new } -define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_seq_cst_i16_generic( +define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_generic( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB60_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB60_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB60_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB60_1; ; SM70-NEXT: $L__BB60_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire seq_cst - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire seq_cst + ret i8 %new } -define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_seq_cst_i16_global( +define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB61_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB61_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB61_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB61_1; ; SM70-NEXT: $L__BB61_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire seq_cst - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst + ret i8 %new } -define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_seq_cst_i16_shared( +define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB62_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB62_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB62_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB62_1; ; SM70-NEXT: $L__BB62_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire seq_cst - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst + ret i8 %new } -define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_monotonic_i16_generic( +define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB63_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB63_3; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB63_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB63_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB63_1; ; SM70-NEXT: $L__BB63_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst + ret i8 %new } -define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_monotonic_i16_global( +define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_global( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_param_0]; +; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB64_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB64_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB64_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB64_1; ; SM70-NEXT: $L__BB64_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release monotonic - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire seq_cst + ret i8 %new } -define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_monotonic_i16_shared( +define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB65_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB65_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB65_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB65_1; ; SM70-NEXT: $L__BB65_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release monotonic - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst + ret i8 %new } -define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_acquire_i16_generic( +define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB66_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB66_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB66_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB66_1; ; SM70-NEXT: $L__BB66_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst + ret i8 %new } -define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_acquire_i16_global( +define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB67_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB67_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB67_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB67_1; ; SM70-NEXT: $L__BB67_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst + ret i8 %new } -define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_acquire_i16_shared( +define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_shared( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_param_0]; +; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB68_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB68_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB68_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB68_1; ; SM70-NEXT: $L__BB68_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire seq_cst + ret i8 %new } -define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_seq_cst_i16_generic( +define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB69_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB69_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB69_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB69_1; ; SM70-NEXT: $L__BB69_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst + ret i8 %new } -define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_seq_cst_i16_global( +define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB70_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB70_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB70_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB70_1; ; SM70-NEXT: $L__BB70_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst + ret i8 %new } -define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_seq_cst_i16_shared( +define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB71_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB71_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB71_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB71_1; ; SM70-NEXT: $L__BB71_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst + ret i8 %new } -define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_monotonic_i16_generic( +define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_generic( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_param_0]; ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB72_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB72_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB72_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB72_1; ; SM70-NEXT: $L__BB72_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release monotonic + ret i8 %new } -define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_monotonic_i16_global( +define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_sys_param_0]; ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB73_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB73_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB73_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB73_1; ; SM70-NEXT: $L__BB73_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release monotonic + ret i8 %new } -define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_monotonic_i16_shared( +define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB74_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB74_3; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB74_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB74_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB74_1; ; SM70-NEXT: $L__BB74_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release monotonic + ret i8 %new } -define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_acquire_i16_generic( +define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB75_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB75_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB75_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB75_1; ; SM70-NEXT: $L__BB75_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel acquire - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release monotonic + ret i8 %new } -define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_acquire_i16_global( +define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_global( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_param_0]; ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB76_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB76_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB76_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB76_1; ; SM70-NEXT: $L__BB76_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release monotonic + ret i8 %new } -define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_acquire_i16_shared( +define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_sys_param_0]; ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB77_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB77_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB77_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB77_1; ; SM70-NEXT: $L__BB77_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release monotonic + ret i8 %new } -define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_seq_cst_i16_generic( +define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB78_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB78_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB78_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB78_1; ; SM70-NEXT: $L__BB78_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic + ret i8 %new } -define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_seq_cst_i16_global( +define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB79_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB79_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB79_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB79_1; ; SM70-NEXT: $L__BB79_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic + ret i8 %new } -define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_seq_cst_i16_shared( +define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_shared( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_param_0]; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB80_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB80_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB80_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB80_1; ; SM70-NEXT: $L__BB80_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release monotonic + ret i8 %new } -define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_monotonic_i16_generic( +define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB81_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB81_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB81_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB81_1; ; SM70-NEXT: $L__BB81_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst monotonic - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release monotonic + ret i8 %new } -define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_monotonic_i16_global( +define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB82_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB82_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB82_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB82_1; ; SM70-NEXT: $L__BB82_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst monotonic - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic + ret i8 %new } -define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_monotonic_i16_shared( +define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB83_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB83_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB83_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB83_1; ; SM70-NEXT: $L__BB83_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst monotonic - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic + ret i8 %new } -define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_acquire_i16_generic( +define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_generic( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_param_0]; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB84_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB84_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB84_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB84_1; ; SM70-NEXT: $L__BB84_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst acquire - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release acquire + ret i8 %new } -define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_acquire_i16_global( +define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB85_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB85_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB85_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB85_1; ; SM70-NEXT: $L__BB85_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst acquire - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release acquire + ret i8 %new } -define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_acquire_i16_shared( +define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB86_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB86_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB86_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB86_1; ; SM70-NEXT: $L__BB86_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst acquire - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release acquire + ret i8 %new } -define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_seq_cst_i16_generic( +define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB87_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB87_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB87_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB87_1; ; SM70-NEXT: $L__BB87_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release acquire + ret i8 %new } -define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_seq_cst_i16_global( +define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_global( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_param_0]; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB88_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB88_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB88_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB88_1; ; SM70-NEXT: $L__BB88_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release acquire + ret i8 %new } -define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_seq_cst_i16_shared( +define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB89_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB89_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB89_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB89_1; ; SM70-NEXT: $L__BB89_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release acquire + ret i8 %new } -define i32 @monotonic_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_monotonic_i32_generic( +define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_global_cta( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_param_2]; -; SM70-NEXT: atom.relaxed.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB90_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB90_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB90_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB90_1; +; SM70-NEXT: $L__BB90_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire + ret i8 %new +} + +define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB91_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB91_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB91_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB91_1; +; SM70-NEXT: $L__BB91_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release acquire + ret i8 %new +} + +define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB92_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB92_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB92_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB92_1; +; SM70-NEXT: $L__BB92_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release acquire + ret i8 %new +} + +define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB93_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB93_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB93_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB93_1; +; SM70-NEXT: $L__BB93_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release acquire + ret i8 %new +} + +define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB94_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB94_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB94_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB94_1; +; SM70-NEXT: $L__BB94_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release acquire + ret i8 %new +} + +define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB95_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB95_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB95_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB95_1; +; SM70-NEXT: $L__BB95_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release acquire + ret i8 %new +} + +define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB96_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB96_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB96_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB96_1; +; SM70-NEXT: $L__BB96_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB97_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB97_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB97_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB97_1; +; SM70-NEXT: $L__BB97_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB98_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB98_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB98_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB98_1; +; SM70-NEXT: $L__BB98_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB99_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB99_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB99_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB99_1; +; SM70-NEXT: $L__BB99_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB100_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB100_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB100_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB100_1; +; SM70-NEXT: $L__BB100_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB101_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB101_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB101_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB101_1; +; SM70-NEXT: $L__BB101_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB102_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB102_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB102_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB102_1; +; SM70-NEXT: $L__BB102_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB103_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB103_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB103_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB103_1; +; SM70-NEXT: $L__BB103_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB104_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB104_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB104_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB104_1; +; SM70-NEXT: $L__BB104_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB105_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB105_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB105_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB105_1; +; SM70-NEXT: $L__BB105_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB106_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB106_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB106_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB106_1; +; SM70-NEXT: $L__BB106_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB107_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB107_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB107_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB107_1; +; SM70-NEXT: $L__BB107_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB108_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB108_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB108_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB108_1; +; SM70-NEXT: $L__BB108_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB109_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB109_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB109_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB109_1; +; SM70-NEXT: $L__BB109_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB110_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB110_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB110_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB110_1; +; SM70-NEXT: $L__BB110_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB111_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB111_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB111_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB111_1; +; SM70-NEXT: $L__BB111_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB112_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB112_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB112_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB112_1; +; SM70-NEXT: $L__BB112_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB113_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB113_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB113_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB113_1; +; SM70-NEXT: $L__BB113_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB114_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB114_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB114_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB114_1; +; SM70-NEXT: $L__BB114_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB115_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB115_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB115_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB115_1; +; SM70-NEXT: $L__BB115_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB116_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB116_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB116_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB116_1; +; SM70-NEXT: $L__BB116_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB117_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB117_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB117_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB117_1; +; SM70-NEXT: $L__BB117_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB118_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB118_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB118_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB118_1; +; SM70-NEXT: $L__BB118_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB119_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB119_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB119_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB119_1; +; SM70-NEXT: $L__BB119_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB120_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB120_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB120_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB120_1; +; SM70-NEXT: $L__BB120_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB121_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB121_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB121_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB121_1; +; SM70-NEXT: $L__BB121_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB122_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB122_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB122_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB122_1; +; SM70-NEXT: $L__BB122_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB123_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB123_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB123_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB123_1; +; SM70-NEXT: $L__BB123_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB124_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB124_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB124_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB124_1; +; SM70-NEXT: $L__BB124_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB125_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB125_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB125_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB125_1; +; SM70-NEXT: $L__BB125_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB126_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB126_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB126_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB126_1; +; SM70-NEXT: $L__BB126_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB127_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB127_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB127_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB127_1; +; SM70-NEXT: $L__BB127_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB128_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB128_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB128_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB128_1; +; SM70-NEXT: $L__BB128_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB129_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB129_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB129_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB129_1; +; SM70-NEXT: $L__BB129_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB130_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB130_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB130_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB130_1; +; SM70-NEXT: $L__BB130_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB131_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB131_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB131_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB131_1; +; SM70-NEXT: $L__BB131_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB132_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB132_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB132_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB132_1; +; SM70-NEXT: $L__BB132_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB133_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB133_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB133_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB133_1; +; SM70-NEXT: $L__BB133_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB134_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB134_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB134_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB134_1; +; SM70-NEXT: $L__BB134_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB135_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB135_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB135_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB135_1; +; SM70-NEXT: $L__BB135_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB136_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB136_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB136_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB136_1; +; SM70-NEXT: $L__BB136_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB137_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB137_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB137_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB137_1; +; SM70-NEXT: $L__BB137_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB138_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB138_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB138_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB138_1; +; SM70-NEXT: $L__BB138_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB139_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB139_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB139_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB139_1; +; SM70-NEXT: $L__BB139_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB140_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB140_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB140_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB140_1; +; SM70-NEXT: $L__BB140_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB141_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB141_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB141_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB141_1; +; SM70-NEXT: $L__BB141_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB142_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB142_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB142_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB142_1; +; SM70-NEXT: $L__BB142_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB143_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB143_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB143_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB143_1; +; SM70-NEXT: $L__BB143_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB144_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB144_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB144_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB144_1; +; SM70-NEXT: $L__BB144_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB145_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB145_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB145_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB145_1; +; SM70-NEXT: $L__BB145_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB146_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB146_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB146_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB146_1; +; SM70-NEXT: $L__BB146_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB147_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB147_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB147_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB147_1; +; SM70-NEXT: $L__BB147_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB148_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB148_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB148_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB148_1; +; SM70-NEXT: $L__BB148_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB149_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB149_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB149_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB149_1; +; SM70-NEXT: $L__BB149_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB150_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB150_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB150_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB150_1; +; SM70-NEXT: $L__BB150_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB151_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB151_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB151_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB151_1; +; SM70-NEXT: $L__BB151_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB152_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB152_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB152_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB152_1; +; SM70-NEXT: $L__BB152_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB153_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB153_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB153_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB153_1; +; SM70-NEXT: $L__BB153_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB154_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB154_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB154_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB154_1; +; SM70-NEXT: $L__BB154_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB155_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB155_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB155_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB155_1; +; SM70-NEXT: $L__BB155_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB156_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB156_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB156_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB156_1; +; SM70-NEXT: $L__BB156_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB157_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB157_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB157_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB157_1; +; SM70-NEXT: $L__BB157_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB158_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB158_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB158_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB158_1; +; SM70-NEXT: $L__BB158_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB159_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB159_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB159_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB159_1; +; SM70-NEXT: $L__BB159_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB160_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB160_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB160_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB160_1; +; SM70-NEXT: $L__BB160_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB161_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB161_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB161_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB161_1; +; SM70-NEXT: $L__BB161_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB162_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB162_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB162_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB162_1; +; SM70-NEXT: $L__BB162_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB163_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB163_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB163_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB163_1; +; SM70-NEXT: $L__BB163_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB164_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB164_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB164_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB164_1; +; SM70-NEXT: $L__BB164_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB165_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB165_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB165_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB165_1; +; SM70-NEXT: $L__BB165_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB166_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB166_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB166_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB166_1; +; SM70-NEXT: $L__BB166_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB167_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB167_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB167_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB167_1; +; SM70-NEXT: $L__BB167_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB168_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB168_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB168_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB168_1; +; SM70-NEXT: $L__BB168_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB169_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB169_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB169_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB169_1; +; SM70-NEXT: $L__BB169_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB170_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB170_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB170_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB170_1; +; SM70-NEXT: $L__BB170_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB171_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB171_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB171_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB171_1; +; SM70-NEXT: $L__BB171_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB172_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB172_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB172_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB172_1; +; SM70-NEXT: $L__BB172_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB173_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB173_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB173_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB173_1; +; SM70-NEXT: $L__BB173_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB174_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB174_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB174_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB174_1; +; SM70-NEXT: $L__BB174_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB175_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB175_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB175_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB175_1; +; SM70-NEXT: $L__BB175_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB176_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB176_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB176_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB176_1; +; SM70-NEXT: $L__BB176_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB177_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB177_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB177_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB177_1; +; SM70-NEXT: $L__BB177_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB178_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB178_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB178_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB178_1; +; SM70-NEXT: $L__BB178_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB179_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB179_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB179_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB179_1; +; SM70-NEXT: $L__BB179_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst + ret i8 %new +} + +define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_monotonic_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB180_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB180_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB180_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB180_1; +; SM70-NEXT: $L__BB180_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_monotonic_i16_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_sys_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_sys_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB181_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB181_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB181_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB181_1; +; SM70-NEXT: $L__BB181_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_monotonic_i16_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_cta_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB182_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB182_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB182_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB182_1; +; SM70-NEXT: $L__BB182_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_monotonic_i16_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_gpu_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_gpu_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB183_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB183_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB183_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB183_1; +; SM70-NEXT: $L__BB183_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_monotonic_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB184_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB184_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB184_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB184_1; +; SM70-NEXT: $L__BB184_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_monotonic_i16_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_sys_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_sys_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB185_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB185_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB185_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB185_1; +; SM70-NEXT: $L__BB185_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_monotonic_i16_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_cta_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB186_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB186_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB186_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB186_1; +; SM70-NEXT: $L__BB186_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_monotonic_i16_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_gpu_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_gpu_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB187_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB187_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB187_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB187_1; +; SM70-NEXT: $L__BB187_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_monotonic_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB188_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB188_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB188_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB188_1; +; SM70-NEXT: $L__BB188_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_monotonic_i16_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_sys_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_sys_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB189_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB189_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB189_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB189_1; +; SM70-NEXT: $L__BB189_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_monotonic_i16_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_cta_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB190_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB190_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB190_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB190_1; +; SM70-NEXT: $L__BB190_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_monotonic_i16_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_gpu_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_gpu_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB191_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB191_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB191_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB191_1; +; SM70-NEXT: $L__BB191_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_acquire_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB192_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB192_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB192_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB192_1; +; SM70-NEXT: $L__BB192_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_acquire_i16_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_sys_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_sys_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB193_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB193_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB193_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB193_1; +; SM70-NEXT: $L__BB193_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_acquire_i16_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_cta_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB194_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB194_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB194_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB194_1; +; SM70-NEXT: $L__BB194_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_acquire_i16_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_gpu_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_gpu_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB195_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB195_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB195_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB195_1; +; SM70-NEXT: $L__BB195_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_acquire_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB196_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB196_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB196_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB196_1; +; SM70-NEXT: $L__BB196_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_acquire_i16_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_sys_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_sys_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB197_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB197_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB197_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB197_1; +; SM70-NEXT: $L__BB197_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_acquire_i16_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_cta_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB198_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB198_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB198_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB198_1; +; SM70-NEXT: $L__BB198_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_acquire_i16_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_gpu_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_gpu_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB199_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB199_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB199_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB199_1; +; SM70-NEXT: $L__BB199_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_acquire_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB200_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB200_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB200_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB200_1; +; SM70-NEXT: $L__BB200_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_acquire_i16_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_sys_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_sys_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB201_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB201_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB201_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB201_1; +; SM70-NEXT: $L__BB201_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_acquire_i16_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_cta_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB202_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB202_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB202_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB202_1; +; SM70-NEXT: $L__BB202_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_acquire_i16_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_gpu_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_gpu_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB203_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB203_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB203_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB203_1; +; SM70-NEXT: $L__BB203_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_seq_cst_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB204_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB204_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB204_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB204_1; +; SM70-NEXT: $L__BB204_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_seq_cst_i16_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB205_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB205_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB205_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB205_1; +; SM70-NEXT: $L__BB205_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_seq_cst_i16_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB206_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB206_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB206_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB206_1; +; SM70-NEXT: $L__BB206_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_seq_cst_i16_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB207_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB207_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB207_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB207_1; +; SM70-NEXT: $L__BB207_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_seq_cst_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB208_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB208_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB208_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB208_1; +; SM70-NEXT: $L__BB208_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_seq_cst_i16_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB209_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB209_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB209_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB209_1; +; SM70-NEXT: $L__BB209_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_seq_cst_i16_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB210_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB210_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB210_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB210_1; +; SM70-NEXT: $L__BB210_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_seq_cst_i16_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB211_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB211_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB211_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB211_1; +; SM70-NEXT: $L__BB211_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_seq_cst_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB212_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB212_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB212_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB212_1; +; SM70-NEXT: $L__BB212_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_seq_cst_i16_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB213_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB213_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB213_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB213_1; +; SM70-NEXT: $L__BB213_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_seq_cst_i16_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB214_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB214_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB214_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB214_1; +; SM70-NEXT: $L__BB214_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_seq_cst_i16_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB215_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB215_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB215_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB215_1; +; SM70-NEXT: $L__BB215_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst + ret i16 %new +} + +define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_monotonic_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB216_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB216_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB216_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB216_1; +; SM70-NEXT: $L__BB216_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_monotonic_i16_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_sys_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_sys_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB217_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB217_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB217_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB217_1; +; SM70-NEXT: $L__BB217_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_monotonic_i16_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_cta_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB218_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB218_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB218_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB218_1; +; SM70-NEXT: $L__BB218_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_monotonic_i16_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_gpu_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_gpu_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB219_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB219_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB219_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB219_1; +; SM70-NEXT: $L__BB219_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_monotonic_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB220_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB220_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB220_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB220_1; +; SM70-NEXT: $L__BB220_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_monotonic_i16_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_sys_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_sys_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB221_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB221_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB221_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB221_1; +; SM70-NEXT: $L__BB221_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_monotonic_i16_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_cta_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB222_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB222_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB222_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB222_1; +; SM70-NEXT: $L__BB222_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_monotonic_i16_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_gpu_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_gpu_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB223_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB223_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB223_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB223_1; +; SM70-NEXT: $L__BB223_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_monotonic_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB224_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB224_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB224_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB224_1; +; SM70-NEXT: $L__BB224_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_monotonic_i16_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_sys_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_sys_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB225_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB225_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB225_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB225_1; +; SM70-NEXT: $L__BB225_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_monotonic_i16_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_cta_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB226_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB226_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB226_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB226_1; +; SM70-NEXT: $L__BB226_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_monotonic_i16_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_gpu_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_gpu_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB227_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB227_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB227_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB227_1; +; SM70-NEXT: $L__BB227_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic + ret i16 %new +} + +define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_acquire_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB228_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB228_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB228_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB228_1; +; SM70-NEXT: $L__BB228_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_acquire_i16_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_sys_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_sys_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB229_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB229_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB229_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB229_1; +; SM70-NEXT: $L__BB229_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_acquire_i16_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_cta_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB230_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB230_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB230_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB230_1; +; SM70-NEXT: $L__BB230_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_acquire_i16_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_gpu_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_gpu_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB231_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB231_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB231_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB231_1; +; SM70-NEXT: $L__BB231_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_acquire_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB232_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB232_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB232_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB232_1; +; SM70-NEXT: $L__BB232_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_acquire_i16_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_sys_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_sys_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB233_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB233_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB233_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB233_1; +; SM70-NEXT: $L__BB233_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_acquire_i16_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_cta_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB234_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB234_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB234_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB234_1; +; SM70-NEXT: $L__BB234_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_acquire_i16_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_gpu_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_gpu_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB235_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB235_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB235_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB235_1; +; SM70-NEXT: $L__BB235_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_acquire_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB236_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB236_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB236_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB236_1; +; SM70-NEXT: $L__BB236_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_acquire_i16_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_sys_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_sys_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB237_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB237_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB237_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB237_1; +; SM70-NEXT: $L__BB237_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_acquire_i16_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_cta_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB238_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB238_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB238_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB238_1; +; SM70-NEXT: $L__BB238_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_acquire_i16_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_gpu_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_gpu_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB239_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB239_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB239_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB239_1; +; SM70-NEXT: $L__BB239_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_seq_cst_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB240_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB240_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB240_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB240_1; +; SM70-NEXT: $L__BB240_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_seq_cst_i16_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB241_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB241_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB241_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB241_1; +; SM70-NEXT: $L__BB241_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_seq_cst_i16_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB242_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB242_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB242_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB242_1; +; SM70-NEXT: $L__BB242_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_seq_cst_i16_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB243_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB243_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB243_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB243_1; +; SM70-NEXT: $L__BB243_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_seq_cst_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB244_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB244_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB244_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB244_1; +; SM70-NEXT: $L__BB244_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_seq_cst_i16_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB245_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB245_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB245_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB245_1; +; SM70-NEXT: $L__BB245_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_seq_cst_i16_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB246_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB246_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB246_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB246_1; +; SM70-NEXT: $L__BB246_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_seq_cst_i16_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB247_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB247_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB247_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB247_1; +; SM70-NEXT: $L__BB247_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_seq_cst_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB248_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB248_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB248_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB248_1; +; SM70-NEXT: $L__BB248_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_seq_cst_i16_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB249_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB249_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB249_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB249_1; +; SM70-NEXT: $L__BB249_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_seq_cst_i16_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB250_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB250_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB250_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB250_1; +; SM70-NEXT: $L__BB250_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_seq_cst_i16_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB251_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB251_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB251_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB251_1; +; SM70-NEXT: $L__BB251_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst + ret i16 %new +} + +define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_monotonic_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB252_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB252_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB252_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB252_1; +; SM70-NEXT: $L__BB252_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_monotonic_i16_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB253_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB253_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB253_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB253_1; +; SM70-NEXT: $L__BB253_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_monotonic_i16_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB254_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB254_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB254_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB254_1; +; SM70-NEXT: $L__BB254_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_monotonic_i16_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB255_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB255_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB255_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB255_1; +; SM70-NEXT: $L__BB255_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_monotonic_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB256_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB256_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB256_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB256_1; +; SM70-NEXT: $L__BB256_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_monotonic_i16_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB257_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB257_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB257_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB257_1; +; SM70-NEXT: $L__BB257_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_monotonic_i16_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB258_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB258_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB258_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB258_1; +; SM70-NEXT: $L__BB258_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_monotonic_i16_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB259_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB259_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB259_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB259_1; +; SM70-NEXT: $L__BB259_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_monotonic_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB260_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB260_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB260_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB260_1; +; SM70-NEXT: $L__BB260_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_monotonic_i16_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB261_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB261_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB261_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB261_1; +; SM70-NEXT: $L__BB261_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_monotonic_i16_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB262_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB262_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB262_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB262_1; +; SM70-NEXT: $L__BB262_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_monotonic_i16_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB263_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB263_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB263_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB263_1; +; SM70-NEXT: $L__BB263_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic + ret i16 %new +} + +define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_acquire_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB264_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB264_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB264_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB264_1; +; SM70-NEXT: $L__BB264_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_acquire_i16_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB265_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB265_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB265_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB265_1; +; SM70-NEXT: $L__BB265_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_acquire_i16_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB266_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB266_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB266_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB266_1; +; SM70-NEXT: $L__BB266_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_acquire_i16_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB267_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB267_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB267_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB267_1; +; SM70-NEXT: $L__BB267_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_acquire_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB268_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB268_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB268_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB268_1; +; SM70-NEXT: $L__BB268_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_acquire_i16_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB269_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB269_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB269_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB269_1; +; SM70-NEXT: $L__BB269_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_acquire_i16_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB270_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB270_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB270_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB270_1; +; SM70-NEXT: $L__BB270_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_acquire_i16_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB271_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB271_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB271_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB271_1; +; SM70-NEXT: $L__BB271_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_acquire_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB272_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB272_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB272_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB272_1; +; SM70-NEXT: $L__BB272_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_acquire_i16_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB273_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB273_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB273_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB273_1; +; SM70-NEXT: $L__BB273_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_acquire_i16_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB274_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB274_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB274_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB274_1; +; SM70-NEXT: $L__BB274_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_acquire_i16_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB275_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB275_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB275_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB275_1; +; SM70-NEXT: $L__BB275_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release acquire + ret i16 %new +} + +define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_seq_cst_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB276_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB276_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB276_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB276_1; +; SM70-NEXT: $L__BB276_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_seq_cst_i16_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB277_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB277_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB277_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB277_1; +; SM70-NEXT: $L__BB277_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_seq_cst_i16_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB278_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB278_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB278_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB278_1; +; SM70-NEXT: $L__BB278_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_seq_cst_i16_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB279_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB279_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB279_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB279_1; +; SM70-NEXT: $L__BB279_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_seq_cst_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB280_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB280_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB280_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB280_1; +; SM70-NEXT: $L__BB280_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_seq_cst_i16_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB281_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB281_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB281_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB281_1; +; SM70-NEXT: $L__BB281_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_seq_cst_i16_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB282_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB282_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB282_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB282_1; +; SM70-NEXT: $L__BB282_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_seq_cst_i16_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB283_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB283_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB283_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB283_1; +; SM70-NEXT: $L__BB283_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_seq_cst_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB284_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB284_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB284_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB284_1; +; SM70-NEXT: $L__BB284_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_seq_cst_i16_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB285_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB285_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB285_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB285_1; +; SM70-NEXT: $L__BB285_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_seq_cst_i16_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB286_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB286_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB286_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB286_1; +; SM70-NEXT: $L__BB286_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_seq_cst_i16_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB287_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB287_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB287_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB287_1; +; SM70-NEXT: $L__BB287_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_monotonic_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB288_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB288_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB288_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB288_1; +; SM70-NEXT: $L__BB288_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_monotonic_i16_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB289_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB289_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB289_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB289_1; +; SM70-NEXT: $L__BB289_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_monotonic_i16_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB290_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB290_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB290_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB290_1; +; SM70-NEXT: $L__BB290_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_monotonic_i16_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB291_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB291_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB291_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB291_1; +; SM70-NEXT: $L__BB291_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_monotonic_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB292_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB292_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB292_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB292_1; +; SM70-NEXT: $L__BB292_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_monotonic_i16_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB293_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB293_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB293_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB293_1; +; SM70-NEXT: $L__BB293_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_monotonic_i16_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB294_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB294_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB294_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB294_1; +; SM70-NEXT: $L__BB294_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_monotonic_i16_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB295_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB295_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB295_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB295_1; +; SM70-NEXT: $L__BB295_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_monotonic_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB296_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB296_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB296_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB296_1; +; SM70-NEXT: $L__BB296_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_monotonic_i16_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB297_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB297_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB297_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB297_1; +; SM70-NEXT: $L__BB297_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_monotonic_i16_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB298_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB298_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB298_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB298_1; +; SM70-NEXT: $L__BB298_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_monotonic_i16_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB299_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB299_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB299_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB299_1; +; SM70-NEXT: $L__BB299_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_acquire_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB300_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB300_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB300_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB300_1; +; SM70-NEXT: $L__BB300_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_acquire_i16_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB301_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB301_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB301_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB301_1; +; SM70-NEXT: $L__BB301_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_acquire_i16_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB302_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB302_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB302_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB302_1; +; SM70-NEXT: $L__BB302_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_acquire_i16_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB303_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB303_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB303_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB303_1; +; SM70-NEXT: $L__BB303_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_acquire_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB304_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB304_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB304_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB304_1; +; SM70-NEXT: $L__BB304_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_acquire_i16_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB305_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB305_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB305_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB305_1; +; SM70-NEXT: $L__BB305_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_acquire_i16_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB306_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB306_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB306_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB306_1; +; SM70-NEXT: $L__BB306_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_acquire_i16_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB307_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB307_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB307_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB307_1; +; SM70-NEXT: $L__BB307_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_acquire_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB308_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB308_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB308_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB308_1; +; SM70-NEXT: $L__BB308_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_acquire_i16_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB309_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB309_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB309_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB309_1; +; SM70-NEXT: $L__BB309_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_acquire_i16_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB310_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB310_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB310_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB310_1; +; SM70-NEXT: $L__BB310_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_acquire_i16_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB311_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB311_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB311_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB311_1; +; SM70-NEXT: $L__BB311_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_seq_cst_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB312_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB312_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB312_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB312_1; +; SM70-NEXT: $L__BB312_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_seq_cst_i16_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB313_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB313_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB313_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB313_1; +; SM70-NEXT: $L__BB313_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_seq_cst_i16_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB314_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB314_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB314_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB314_1; +; SM70-NEXT: $L__BB314_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_seq_cst_i16_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB315_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB315_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB315_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB315_1; +; SM70-NEXT: $L__BB315_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_seq_cst_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB316_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB316_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB316_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB316_1; +; SM70-NEXT: $L__BB316_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_seq_cst_i16_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB317_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB317_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB317_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB317_1; +; SM70-NEXT: $L__BB317_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_seq_cst_i16_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB318_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB318_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB318_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB318_1; +; SM70-NEXT: $L__BB318_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_seq_cst_i16_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB319_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB319_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB319_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB319_1; +; SM70-NEXT: $L__BB319_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_seq_cst_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB320_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB320_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB320_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB320_1; +; SM70-NEXT: $L__BB320_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_seq_cst_i16_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB321_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB321_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB321_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB321_1; +; SM70-NEXT: $L__BB321_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_seq_cst_i16_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB322_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB322_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB322_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB322_1; +; SM70-NEXT: $L__BB322_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_seq_cst_i16_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB323_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB323_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB323_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB323_1; +; SM70-NEXT: $L__BB323_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_monotonic_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB324_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB324_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB324_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB324_1; +; SM70-NEXT: $L__BB324_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_monotonic_i16_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB325_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB325_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB325_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB325_1; +; SM70-NEXT: $L__BB325_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_monotonic_i16_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB326_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB326_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB326_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB326_1; +; SM70-NEXT: $L__BB326_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_monotonic_i16_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB327_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB327_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB327_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB327_1; +; SM70-NEXT: $L__BB327_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_monotonic_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB328_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB328_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB328_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB328_1; +; SM70-NEXT: $L__BB328_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_monotonic_i16_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB329_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB329_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB329_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB329_1; +; SM70-NEXT: $L__BB329_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_monotonic_i16_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB330_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB330_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB330_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB330_1; +; SM70-NEXT: $L__BB330_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_monotonic_i16_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB331_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB331_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB331_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB331_1; +; SM70-NEXT: $L__BB331_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_monotonic_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB332_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB332_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB332_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB332_1; +; SM70-NEXT: $L__BB332_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_monotonic_i16_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB333_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB333_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB333_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB333_1; +; SM70-NEXT: $L__BB333_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_monotonic_i16_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB334_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB334_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB334_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB334_1; +; SM70-NEXT: $L__BB334_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_monotonic_i16_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB335_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB335_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB335_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB335_1; +; SM70-NEXT: $L__BB335_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_acquire_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB336_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB336_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB336_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB336_1; +; SM70-NEXT: $L__BB336_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_acquire_i16_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB337_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB337_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB337_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB337_1; +; SM70-NEXT: $L__BB337_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_acquire_i16_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB338_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB338_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB338_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB338_1; +; SM70-NEXT: $L__BB338_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_acquire_i16_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB339_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB339_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB339_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB339_1; +; SM70-NEXT: $L__BB339_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_acquire_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB340_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB340_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB340_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB340_1; +; SM70-NEXT: $L__BB340_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_acquire_i16_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB341_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB341_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB341_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB341_1; +; SM70-NEXT: $L__BB341_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_acquire_i16_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB342_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB342_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB342_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB342_1; +; SM70-NEXT: $L__BB342_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_acquire_i16_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB343_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB343_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB343_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB343_1; +; SM70-NEXT: $L__BB343_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_acquire_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB344_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB344_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB344_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB344_1; +; SM70-NEXT: $L__BB344_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_acquire_i16_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB345_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB345_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB345_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB345_1; +; SM70-NEXT: $L__BB345_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_acquire_i16_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB346_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB346_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB346_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB346_1; +; SM70-NEXT: $L__BB346_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_acquire_i16_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB347_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB347_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB347_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB347_1; +; SM70-NEXT: $L__BB347_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_seq_cst_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB348_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB348_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB348_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB348_1; +; SM70-NEXT: $L__BB348_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_seq_cst_i16_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB349_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB349_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB349_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB349_1; +; SM70-NEXT: $L__BB349_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_seq_cst_i16_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB350_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB350_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB350_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB350_1; +; SM70-NEXT: $L__BB350_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_seq_cst_i16_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB351_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB351_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB351_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB351_1; +; SM70-NEXT: $L__BB351_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_seq_cst_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB352_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB352_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB352_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB352_1; +; SM70-NEXT: $L__BB352_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_seq_cst_i16_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB353_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB353_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB353_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB353_1; +; SM70-NEXT: $L__BB353_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_seq_cst_i16_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB354_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB354_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB354_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB354_1; +; SM70-NEXT: $L__BB354_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_seq_cst_i16_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB355_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB355_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB355_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB355_1; +; SM70-NEXT: $L__BB355_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_seq_cst_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB356_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB356_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB356_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB356_1; +; SM70-NEXT: $L__BB356_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_seq_cst_i16_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB357_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB357_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB357_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB357_1; +; SM70-NEXT: $L__BB357_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_seq_cst_i16_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB358_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB358_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB358_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB358_1; +; SM70-NEXT: $L__BB358_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_seq_cst_i16_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB359_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB359_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB359_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB359_1; +; SM70-NEXT: $L__BB359_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst + ret i16 %new +} + +define i32 @monotonic_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_monotonic_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_param_2]; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_monotonic_i32_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_sys_param_2]; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_monotonic_i32_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_cta_param_2]; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_monotonic_i32_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_gpu_param_2]; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_monotonic_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_param_2]; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_monotonic_i32_global_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_sys_param_2]; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_monotonic_i32_global_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_cta_param_2]; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_monotonic_i32_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_gpu_param_2]; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_monotonic_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_param_2]; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_monotonic_i32_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_sys_param_2]; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_monotonic_i32_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_cta_param_2]; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_monotonic_i32_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_gpu_param_2]; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_acquire_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_acquire_i32_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_acquire_i32_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_acquire_i32_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_acquire_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_acquire_i32_global_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_acquire_i32_global_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_acquire_i32_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_acquire_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_acquire_i32_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_acquire_i32_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_acquire_i32_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_seq_cst_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_seq_cst_i32_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_seq_cst_i32_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_seq_cst_i32_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_seq_cst_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_seq_cst_i32_global_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_seq_cst_i32_global_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_seq_cst_i32_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_seq_cst_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_seq_cst_i32_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_seq_cst_i32_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_seq_cst_i32_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst + ret i32 %new +} + +define i32 @acquire_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_monotonic_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_monotonic_i32_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_monotonic_i32_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_monotonic_i32_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_monotonic_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_monotonic_i32_global_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_monotonic_i32_global_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_monotonic_i32_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_monotonic_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_monotonic_i32_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_monotonic_i32_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_monotonic_i32_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic + ret i32 %new +} + +define i32 @acquire_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_acquire_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_acquire_i32_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_acquire_i32_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_acquire_i32_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_acquire_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_acquire_i32_global_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_acquire_i32_global_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_acquire_i32_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_acquire_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_acquire_i32_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_acquire_i32_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_acquire_i32_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_seq_cst_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_seq_cst_i32_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_seq_cst_i32_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_seq_cst_i32_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_seq_cst_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_seq_cst_i32_global_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_seq_cst_i32_global_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_seq_cst_i32_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_seq_cst_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_seq_cst_i32_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_seq_cst_i32_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_seq_cst_i32_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst + ret i32 %new +} + +define i32 @release_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_monotonic_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_param_2]; +; SM70-NEXT: atom.release.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_monotonic_i32_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_sys_param_2]; +; SM70-NEXT: atom.release.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_monotonic_i32_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_cta_param_2]; +; SM70-NEXT: atom.release.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_monotonic_i32_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_gpu_param_2]; +; SM70-NEXT: atom.release.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_monotonic_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_param_2]; +; SM70-NEXT: atom.release.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_monotonic_i32_global_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_sys_param_2]; +; SM70-NEXT: atom.release.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_monotonic_i32_global_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_cta_param_2]; +; SM70-NEXT: atom.release.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_monotonic_i32_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_gpu_param_2]; +; SM70-NEXT: atom.release.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_monotonic_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_param_2]; +; SM70-NEXT: atom.release.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_monotonic_i32_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_sys_param_2]; +; SM70-NEXT: atom.release.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_monotonic_i32_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_cta_param_2]; +; SM70-NEXT: atom.release.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_monotonic_i32_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_gpu_param_2]; +; SM70-NEXT: atom.release.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release monotonic + ret i32 %new +} + +define i32 @release_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_acquire_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_param_2]; +; SM70-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_acquire_i32_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_acquire_i32_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_acquire_i32_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_gpu_param_2]; +; SM70-NEXT: atom.acq_rel.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_acquire_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_param_2]; +; SM70-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_acquire_i32_global_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_acquire_i32_global_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_acquire_i32_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_gpu_param_2]; +; SM70-NEXT: atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_acquire_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_param_2]; +; SM70-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_acquire_i32_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_acquire_i32_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_acquire_i32_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_gpu_param_2]; +; SM70-NEXT: atom.acq_rel.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release acquire + ret i32 %new +} + +define i32 @release_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_seq_cst_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_seq_cst_i32_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_seq_cst_i32_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_seq_cst_i32_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_seq_cst_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_seq_cst_i32_global_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_seq_cst_i32_global_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_seq_cst_i32_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_seq_cst_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_seq_cst_i32_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_seq_cst_i32_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_seq_cst_i32_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_monotonic_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_param_2]; +; SM70-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_monotonic_i32_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_monotonic_i32_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_monotonic_i32_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_gpu_param_2]; +; SM70-NEXT: atom.acq_rel.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_monotonic_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_param_2]; +; SM70-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_monotonic_i32_global_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_monotonic_i32_global_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_monotonic_i32_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_gpu_param_2]; +; SM70-NEXT: atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_monotonic_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_param_2]; +; SM70-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_monotonic_i32_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_monotonic_i32_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_monotonic_i32_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_gpu_param_2]; +; SM70-NEXT: atom.acq_rel.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_acquire_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_param_2]; +; SM70-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_acquire_i32_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_acquire_i32_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_acquire_i32_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_gpu_param_2]; +; SM70-NEXT: atom.acq_rel.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_acquire_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_param_2]; +; SM70-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_acquire_i32_global_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_acquire_i32_global_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_acquire_i32_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_gpu_param_2]; +; SM70-NEXT: atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_acquire_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_param_2]; +; SM70-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_acquire_i32_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_acquire_i32_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_acquire_i32_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_gpu_param_2]; +; SM70-NEXT: atom.acq_rel.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_seq_cst_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_seq_cst_i32_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_seq_cst_i32_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_seq_cst_i32_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_seq_cst_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_seq_cst_i32_global_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_seq_cst_i32_global_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_seq_cst_i32_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_seq_cst_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_seq_cst_i32_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_seq_cst_i32_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_seq_cst_i32_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_monotonic_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_monotonic_i32_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_monotonic_i32_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_monotonic_i32_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_monotonic_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_monotonic_i32_global_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_monotonic_i32_global_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_monotonic_i32_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_monotonic_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_monotonic_i32_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_monotonic_i32_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_monotonic_i32_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_acquire_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_acquire_i32_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_acquire_i32_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_acquire_i32_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_acquire_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_acquire_i32_global_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_acquire_i32_global_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_acquire_i32_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_acquire_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_acquire_i32_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_acquire_i32_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_acquire_i32_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_seq_cst_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_seq_cst_i32_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_seq_cst_i32_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_seq_cst_i32_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_seq_cst_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_seq_cst_i32_global_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_seq_cst_i32_global_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_seq_cst_i32_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_seq_cst_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_seq_cst_i32_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_seq_cst_i32_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_seq_cst_i32_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst + ret i32 %new +} + +define i64 @monotonic_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_monotonic_i64_generic( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_param_2]; +; SM70-NEXT: atom.relaxed.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_monotonic_i64_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_sys_param_2]; +; SM70-NEXT: atom.relaxed.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_monotonic_i64_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_cta_param_2]; +; SM70-NEXT: atom.relaxed.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_monotonic_i64_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_gpu_param_2]; +; SM70-NEXT: atom.relaxed.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_monotonic_i64_global( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_param_2]; +; SM70-NEXT: atom.relaxed.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_monotonic_i64_global_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_sys_param_2]; +; SM70-NEXT: atom.relaxed.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_monotonic_i64_global_cta( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_cta_param_2]; +; SM70-NEXT: atom.relaxed.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_monotonic_i64_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_gpu_param_2]; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_monotonic_i64_shared( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_param_2]; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_monotonic_i64_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_sys_param_2]; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_monotonic_i64_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_cta_param_2]; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_monotonic_i64_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_gpu_param_2]; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_acquire_i64_generic( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_acquire_i64_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_acquire_i64_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_acquire_i64_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_acquire_i64_global( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_acquire_i64_global_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_acquire_i64_global_cta( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_acquire_i64_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_acquire_i64_shared( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_acquire_i64_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_acquire_i64_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_acquire_i64_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_seq_cst_i64_generic( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_seq_cst_i64_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_seq_cst_i64_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_seq_cst_i64_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_seq_cst_i64_global( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_seq_cst_i64_global_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_seq_cst_i64_global_cta( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_seq_cst_i64_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_seq_cst_i64_shared( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_seq_cst_i64_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_seq_cst_i64_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_seq_cst_i64_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst + ret i64 %new +} + +define i64 @acquire_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_monotonic_i64_generic( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_monotonic_i64_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_monotonic_i64_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_monotonic_i64_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_monotonic_i64_global( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_monotonic_i64_global_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_monotonic_i64_global_cta( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_monotonic_i64_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_monotonic_i64_shared( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_monotonic_i64_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_monotonic_i64_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_monotonic_i64_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic + ret i64 %new +} + +define i64 @acquire_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_acquire_i64_generic( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_acquire_i64_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_acquire_i64_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_acquire_i64_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_acquire_i64_global( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_acquire_i64_global_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_acquire_i64_global_cta( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_acquire_i64_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_acquire_i64_shared( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_acquire_i64_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_acquire_i64_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_acquire_i64_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_seq_cst_i64_generic( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_seq_cst_i64_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_seq_cst_i64_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_seq_cst_i64_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_seq_cst_i64_global( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_seq_cst_i64_global_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_seq_cst_i64_global_cta( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_seq_cst_i64_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_seq_cst_i64_shared( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_seq_cst_i64_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_seq_cst_i64_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_seq_cst_i64_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst + ret i64 %new +} + +define i64 @release_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_monotonic_i64_generic( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_param_2]; +; SM70-NEXT: atom.release.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_monotonic_i64_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_sys_param_2]; +; SM70-NEXT: atom.release.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_monotonic_i64_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_cta_param_2]; +; SM70-NEXT: atom.release.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_monotonic_i64_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_gpu_param_2]; +; SM70-NEXT: atom.release.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_monotonic_i64_global( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_param_2]; +; SM70-NEXT: atom.release.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_monotonic_i64_global_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_sys_param_2]; +; SM70-NEXT: atom.release.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_monotonic_i64_global_cta( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_cta_param_2]; +; SM70-NEXT: atom.release.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_monotonic_i64_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_gpu_param_2]; +; SM70-NEXT: atom.release.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_monotonic_i64_shared( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_param_2]; +; SM70-NEXT: atom.release.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_monotonic_i64_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_sys_param_2]; +; SM70-NEXT: atom.release.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_monotonic_i64_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_cta_param_2]; +; SM70-NEXT: atom.release.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_monotonic_i64_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_gpu_param_2]; +; SM70-NEXT: atom.release.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release monotonic + ret i64 %new +} + +define i64 @release_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_acquire_i64_generic( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_param_2]; +; SM70-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release acquire + ret i64 %new +} + +define i64 @release_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_acquire_i64_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release acquire + ret i64 %new +} + +define i64 @release_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_acquire_i64_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release acquire + ret i64 %new +} + +define i64 @release_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_acquire_i64_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_gpu_param_2]; +; SM70-NEXT: atom.acq_rel.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") release acquire + ret i64 %new +} + +define i64 @release_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_acquire_i64_global( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_param_2]; +; SM70-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release acquire + ret i64 %new +} + +define i64 @release_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_acquire_i64_global_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release acquire + ret i64 %new +} + +define i64 @release_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_acquire_i64_global_cta( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic monotonic - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release acquire + ret i64 %new } -define i32 @monotonic_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_monotonic_i32_global( +define i64 @release_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_acquire_i64_global_gpu( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_param_2]; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_gpu_param_2]; +; SM70-NEXT: atom.acq_rel.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic monotonic - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release acquire + ret i64 %new } -define i32 @monotonic_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_monotonic_i32_shared( +define i64 @release_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_acquire_i64_shared( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_param_2]; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_param_2]; +; SM70-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic monotonic - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release acquire + ret i64 %new } -define i32 @monotonic_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_acquire_i32_generic( +define i64 @release_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_acquire_i64_shared_sys( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release acquire + ret i64 %new } -define i32 @monotonic_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_acquire_i32_global( +define i64 @release_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_acquire_i64_shared_cta( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release acquire + ret i64 %new } -define i32 @monotonic_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_acquire_i32_shared( +define i64 @release_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_acquire_i64_shared_gpu( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_gpu_param_2]; +; SM70-NEXT: atom.acq_rel.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release acquire + ret i64 %new } -define i32 @monotonic_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_seq_cst_i32_generic( +define i64 @release_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_seq_cst_i64_generic( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic seq_cst - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release seq_cst + ret i64 %new } -define i32 @monotonic_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_seq_cst_i32_global( +define i64 @release_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_seq_cst_i64_generic_sys( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic seq_cst - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release seq_cst + ret i64 %new } -define i32 @monotonic_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_seq_cst_i32_shared( +define i64 @release_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_seq_cst_i64_generic_cta( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic seq_cst - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst + ret i64 %new } -define i32 @acquire_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_monotonic_i32_generic( +define i64 @release_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_seq_cst_i64_generic_gpu( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") release seq_cst + ret i64 %new } -define i32 @acquire_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_monotonic_i32_global( +define i64 @release_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_seq_cst_i64_global( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire monotonic - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release seq_cst + ret i64 %new } -define i32 @acquire_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_monotonic_i32_shared( +define i64 @release_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_seq_cst_i64_global_sys( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire monotonic - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release seq_cst + ret i64 %new } -define i32 @acquire_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_acquire_i32_generic( +define i64 @release_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_seq_cst_i64_global_cta( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst + ret i64 %new } -define i32 @acquire_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_acquire_i32_global( +define i64 @release_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_seq_cst_i64_global_gpu( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release seq_cst + ret i64 %new } -define i32 @acquire_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_acquire_i32_shared( +define i64 @release_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_seq_cst_i64_shared( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release seq_cst + ret i64 %new } -define i32 @acquire_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_seq_cst_i32_generic( +define i64 @release_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_seq_cst_i64_shared_sys( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release seq_cst + ret i64 %new } -define i32 @acquire_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_seq_cst_i32_global( +define i64 @release_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_seq_cst_i64_shared_cta( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst + ret i64 %new } -define i32 @acquire_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_seq_cst_i32_shared( +define i64 @release_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_seq_cst_i64_shared_gpu( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release seq_cst + ret i64 %new } -define i32 @release_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_monotonic_i32_generic( +define i64 @acq_rel_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_monotonic_i64_generic( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_param_2]; -; SM70-NEXT: atom.release.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_param_2]; +; SM70-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel monotonic + ret i64 %new } -define i32 @release_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_monotonic_i32_global( +define i64 @acq_rel_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_monotonic_i64_generic_sys( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_param_2]; -; SM70-NEXT: atom.release.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic + ret i64 %new } -define i32 @release_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_monotonic_i32_shared( +define i64 @acq_rel_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_monotonic_i64_generic_cta( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_param_2]; -; SM70-NEXT: atom.release.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic + ret i64 %new } -define i32 @release_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_acquire_i32_generic( +define i64 @acq_rel_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_monotonic_i64_generic_gpu( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_param_2]; -; SM70-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_gpu_param_2]; +; SM70-NEXT: atom.acq_rel.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release acquire - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic + ret i64 %new } -define i32 @release_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_acquire_i32_global( +define i64 @acq_rel_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_monotonic_i64_global( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_param_2]; -; SM70-NEXT: atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_param_2]; +; SM70-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel monotonic + ret i64 %new } -define i32 @release_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_acquire_i32_shared( +define i64 @acq_rel_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_monotonic_i64_global_sys( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_param_2]; -; SM70-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic + ret i64 %new } -define i32 @release_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_seq_cst_i32_generic( +define i64 @acq_rel_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_monotonic_i64_global_cta( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic + ret i64 %new } -define i32 @release_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_seq_cst_i32_global( +define i64 @acq_rel_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_monotonic_i64_global_gpu( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_gpu_param_2]; +; SM70-NEXT: atom.acq_rel.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic + ret i64 %new } -define i32 @release_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_seq_cst_i32_shared( +define i64 @acq_rel_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_monotonic_i64_shared( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_param_2]; +; SM70-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel monotonic + ret i64 %new } -define i32 @acq_rel_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_monotonic_i32_generic( +define i64 @acq_rel_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_monotonic_i64_shared_sys( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_param_2]; -; SM70-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel monotonic - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic + ret i64 %new } -define i32 @acq_rel_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_monotonic_i32_global( +define i64 @acq_rel_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_monotonic_i64_shared_cta( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_param_2]; -; SM70-NEXT: atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel monotonic - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic + ret i64 %new } -define i32 @acq_rel_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_monotonic_i32_shared( +define i64 @acq_rel_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_monotonic_i64_shared_gpu( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_param_2]; -; SM70-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_gpu_param_2]; +; SM70-NEXT: atom.acq_rel.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel monotonic - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic + ret i64 %new } -define i32 @acq_rel_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_acquire_i32_generic( +define i64 @acq_rel_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_acquire_i64_generic( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_param_2]; -; SM70-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_param_2]; +; SM70-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel acquire - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel acquire + ret i64 %new } -define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_acquire_i32_global( +define i64 @acq_rel_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_acquire_i64_generic_sys( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_param_2]; -; SM70-NEXT: atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire + ret i64 %new } -define i32 @acq_rel_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_acquire_i32_shared( +define i64 @acq_rel_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_acquire_i64_generic_cta( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_param_2]; -; SM70-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel acquire - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire + ret i64 %new } -define i32 @acq_rel_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_seq_cst_i32_generic( +define i64 @acq_rel_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_acquire_i64_generic_gpu( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_gpu_param_2]; +; SM70-NEXT: atom.acq_rel.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel seq_cst - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire + ret i64 %new } -define i32 @acq_rel_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_seq_cst_i32_global( +define i64 @acq_rel_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_acquire_i64_global( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel seq_cst - ret i32 %new +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_param_2]; +; SM70-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel acquire + ret i64 %new } -define i32 @acq_rel_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_seq_cst_i32_shared( +define i64 @acq_rel_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_acquire_i64_global_sys( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire + ret i64 %new } -define i32 @seq_cst_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_monotonic_i32_generic( +define i64 @acq_rel_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_acquire_i64_global_cta( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst monotonic - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire + ret i64 %new } -define i32 @seq_cst_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_monotonic_i32_global( +define i64 @acq_rel_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_acquire_i64_global_gpu( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_gpu_param_2]; +; SM70-NEXT: atom.acq_rel.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst monotonic - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire + ret i64 %new } -define i32 @seq_cst_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_monotonic_i32_shared( +define i64 @acq_rel_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_acquire_i64_shared( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_param_2]; +; SM70-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst monotonic - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel acquire + ret i64 %new } -define i32 @seq_cst_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_acquire_i32_generic( +define i64 @acq_rel_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_acquire_i64_shared_sys( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire + ret i64 %new } -define i32 @seq_cst_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_acquire_i32_global( +define i64 @acq_rel_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_acquire_i64_shared_cta( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire + ret i64 %new } -define i32 @seq_cst_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_acquire_i32_shared( +define i64 @acq_rel_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_acquire_i64_shared_gpu( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_gpu_param_2]; +; SM70-NEXT: atom.acq_rel.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire + ret i64 %new } -define i32 @seq_cst_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_seq_cst_i32_generic( +define i64 @acq_rel_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_seq_cst_i64_generic( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel seq_cst + ret i64 %new } -define i32 @seq_cst_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_seq_cst_i32_global( +define i64 @acq_rel_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_seq_cst_i64_generic_sys( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst seq_cst - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst + ret i64 %new } -define i32 @seq_cst_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_seq_cst_i32_shared( +define i64 @acq_rel_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_seq_cst_i64_generic_cta( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst seq_cst - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst + ret i64 %new } -define i64 @monotonic_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_monotonic_i64_generic( +define i64 @acq_rel_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_seq_cst_i64_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_param_2]; -; SM70-NEXT: atom.relaxed.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst ret i64 %new } -define i64 @monotonic_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_monotonic_i64_global( +define i64 @acq_rel_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_seq_cst_i64_global( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_param_2]; -; SM70-NEXT: atom.relaxed.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel seq_cst ret i64 %new } -define i64 @monotonic_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_monotonic_i64_shared( +define i64 @acq_rel_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_seq_cst_i64_global_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_param_2]; -; SM70-NEXT: atom.relaxed.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst ret i64 %new } -define i64 @monotonic_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_acquire_i64_generic( +define i64 @acq_rel_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_seq_cst_i64_global_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst ret i64 %new } -define i64 @monotonic_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_acquire_i64_global( +define i64 @acq_rel_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_seq_cst_i64_global_gpu( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst ret i64 %new } -define i64 @monotonic_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_acquire_i64_shared( +define i64 @acq_rel_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_seq_cst_i64_shared( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel seq_cst ret i64 %new } -define i64 @monotonic_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_seq_cst_i64_generic( +define i64 @acq_rel_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_seq_cst_i64_shared_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst ret i64 %new } -define i64 @monotonic_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_seq_cst_i64_global( +define i64 @acq_rel_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_seq_cst_i64_shared_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst ret i64 %new } -define i64 @monotonic_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_seq_cst_i64_shared( +define i64 @acq_rel_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_seq_cst_i64_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst ret i64 %new } -define i64 @acquire_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_monotonic_i64_generic( +define i64 @seq_cst_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_monotonic_i64_generic( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst monotonic ret i64 %new } -define i64 @acquire_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_monotonic_i64_global( +define i64 @seq_cst_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_monotonic_i64_generic_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic ret i64 %new } -define i64 @acquire_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_monotonic_i64_shared( +define i64 @seq_cst_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_monotonic_i64_generic_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic ret i64 %new } -define i64 @acquire_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_acquire_i64_generic( +define i64 @seq_cst_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_monotonic_i64_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire acquire + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic ret i64 %new } -define i64 @acquire_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_acquire_i64_global( +define i64 @seq_cst_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_monotonic_i64_global( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst monotonic ret i64 %new } -define i64 @acquire_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_acquire_i64_shared( +define i64 @seq_cst_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_monotonic_i64_global_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic ret i64 %new } -define i64 @acquire_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_seq_cst_i64_generic( +define i64 @seq_cst_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_monotonic_i64_global_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic ret i64 %new } -define i64 @acquire_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_seq_cst_i64_global( +define i64 @seq_cst_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_monotonic_i64_global_gpu( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic ret i64 %new } -define i64 @acquire_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_seq_cst_i64_shared( +define i64 @seq_cst_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_monotonic_i64_shared( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst monotonic ret i64 %new } -define i64 @release_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_monotonic_i64_generic( +define i64 @seq_cst_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_monotonic_i64_shared_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_param_2]; -; SM70-NEXT: atom.release.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic ret i64 %new } -define i64 @release_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_monotonic_i64_global( +define i64 @seq_cst_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_monotonic_i64_shared_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_param_2]; -; SM70-NEXT: atom.release.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic ret i64 %new } -define i64 @release_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_monotonic_i64_shared( +define i64 @seq_cst_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_monotonic_i64_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_param_2]; -; SM70-NEXT: atom.release.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic ret i64 %new } -define i64 @release_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_acquire_i64_generic( +define i64 @seq_cst_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_acquire_i64_generic( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_param_2]; -; SM70-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release acquire + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst acquire ret i64 %new } -define i64 @release_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_acquire_i64_global( +define i64 @seq_cst_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_acquire_i64_generic_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_param_2]; -; SM70-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release acquire + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire ret i64 %new } -define i64 @release_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_acquire_i64_shared( +define i64 @seq_cst_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_acquire_i64_generic_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_param_2]; -; SM70-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release acquire + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire ret i64 %new } -define i64 @release_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_seq_cst_i64_generic( +define i64 @seq_cst_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_acquire_i64_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release seq_cst + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire ret i64 %new } -define i64 @release_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_seq_cst_i64_global( +define i64 @seq_cst_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_acquire_i64_global( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst acquire ret i64 %new } -define i64 @release_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_seq_cst_i64_shared( +define i64 @seq_cst_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_acquire_i64_global_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire ret i64 %new } -define i64 @acq_rel_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_monotonic_i64_generic( +define i64 @seq_cst_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_acquire_i64_global_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_param_2]; -; SM70-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire ret i64 %new } -define i64 @acq_rel_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_monotonic_i64_global( +define i64 @seq_cst_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_acquire_i64_global_gpu( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_param_2]; -; SM70-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire ret i64 %new } -define i64 @acq_rel_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_monotonic_i64_shared( +define i64 @seq_cst_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_acquire_i64_shared( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_param_2]; -; SM70-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst acquire ret i64 %new } -define i64 @acq_rel_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_acquire_i64_generic( +define i64 @seq_cst_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_acquire_i64_shared_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_param_2]; -; SM70-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire ret i64 %new } -define i64 @acq_rel_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_acquire_i64_global( +define i64 @seq_cst_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_acquire_i64_shared_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_param_2]; -; SM70-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire ret i64 %new } -define i64 @acq_rel_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_acquire_i64_shared( +define i64 @seq_cst_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_acquire_i64_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_param_2]; -; SM70-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire ret i64 %new } -define i64 @acq_rel_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_seq_cst_i64_generic( +define i64 @seq_cst_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_seq_cst_i64_generic( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel seq_cst + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst ret i64 %new } -define i64 @acq_rel_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_seq_cst_i64_global( +define i64 @seq_cst_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_seq_cst_i64_generic_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel seq_cst + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst ret i64 %new } -define i64 @acq_rel_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_seq_cst_i64_shared( +define i64 @seq_cst_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_seq_cst_i64_generic_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel seq_cst + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_monotonic_i64_generic( +define i64 @seq_cst_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_seq_cst_i64_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_monotonic_i64_global( +define i64 @seq_cst_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_seq_cst_i64_global( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_monotonic_i64_shared( +define i64 @seq_cst_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_seq_cst_i64_global_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_acquire_i64_generic( +define i64 @seq_cst_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_seq_cst_i64_global_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_acquire_i64_global( +define i64 @seq_cst_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_seq_cst_i64_global_gpu( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_acquire_i64_shared( +define i64 @seq_cst_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_seq_cst_i64_shared( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_seq_cst_i64_generic( +define i64 @seq_cst_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_seq_cst_i64_shared_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_seq_cst_i64_global( +define i64 @seq_cst_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_seq_cst_i64_shared_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_seq_cst_i64_shared( +define i64 @seq_cst_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_seq_cst_i64_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst ret i64 %new } diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll index f289c3cf3d509..6f8ebcb8008bf 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll @@ -31,7 +31,7 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB0_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -47,8 +47,8 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ret i8 %new } -define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_monotonic_i8_global( +define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_generic_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -56,8 +56,8 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -68,15 +68,15 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB1_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB1_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -88,12 +88,12 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: $L__BB1_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_monotonic_i8_shared( +define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_generic_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -101,8 +101,8 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -113,15 +113,15 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB2_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -133,12 +133,12 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: $L__BB2_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new } -define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_acquire_i8_generic( +define i8 @monotonic_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -146,8 +146,8 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -158,7 +158,7 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -166,7 +166,7 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB3_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -176,15 +176,14 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB3_1; ; SM90-NEXT: $L__BB3_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic monotonic ret i8 %new } -define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_acquire_i8_global( +define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -192,8 +191,8 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -204,15 +203,15 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB4_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB4_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -222,15 +221,14 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB4_1; ; SM90-NEXT: $L__BB4_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new } -define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_acquire_i8_shared( +define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_global( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -238,8 +236,8 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -250,15 +248,15 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB5_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB5_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -268,15 +266,14 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB5_1; ; SM90-NEXT: $L__BB5_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic monotonic ret i8 %new } -define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_seq_cst_i8_generic( +define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_global_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -284,9 +281,8 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -297,15 +293,15 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB6_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB6_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -315,15 +311,14 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB6_1; ; SM90-NEXT: $L__BB6_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new } -define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_seq_cst_i8_global( +define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -331,9 +326,8 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -344,7 +338,7 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -352,7 +346,7 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB7_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -362,15 +356,14 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB7_1; ; SM90-NEXT: $L__BB7_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new } -define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_seq_cst_i8_shared( +define i8 @monotonic_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_global_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -378,9 +371,8 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -391,15 +383,15 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB8_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB8_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -409,15 +401,14 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB8_1; ; SM90-NEXT: $L__BB8_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic monotonic ret i8 %new } -define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_monotonic_i8_generic( +define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_global_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -425,8 +416,8 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -437,15 +428,15 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB9_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -455,15 +446,14 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB9_1; ; SM90-NEXT: $L__BB9_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new } -define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_monotonic_i8_global( +define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_shared( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -471,8 +461,8 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -483,15 +473,15 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB10_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB10_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -501,15 +491,14 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB10_1; ; SM90-NEXT: $L__BB10_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic monotonic ret i8 %new } -define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_monotonic_i8_shared( +define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_shared_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -517,8 +506,8 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -529,7 +518,7 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -537,7 +526,7 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB11_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -547,15 +536,14 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB11_1; ; SM90-NEXT: $L__BB11_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new } -define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_acquire_i8_generic( +define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_shared_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -563,8 +551,8 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -575,15 +563,15 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB12_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB12_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -593,15 +581,14 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB12_1; ; SM90-NEXT: $L__BB12_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new } -define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_acquire_i8_global( +define i8 @monotonic_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -609,8 +596,8 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -621,15 +608,15 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB13_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB13_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -639,15 +626,14 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB13_1; ; SM90-NEXT: $L__BB13_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic monotonic ret i8 %new } -define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_acquire_i8_shared( +define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -655,8 +641,8 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -667,7 +653,7 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -675,7 +661,7 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB14_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -685,15 +671,14 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB14_1; ; SM90-NEXT: $L__BB14_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new } -define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_seq_cst_i8_generic( +define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_generic( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -701,9 +686,8 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -714,7 +698,7 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -722,7 +706,7 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB15_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -735,12 +719,12 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_seq_cst_i8_global( +define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_generic_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -748,9 +732,8 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -761,15 +744,15 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB16_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB16_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -782,12 +765,12 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_seq_cst_i8_shared( +define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_generic_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -795,9 +778,8 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -808,15 +790,15 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB17_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB17_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -826,15 +808,15 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB17_1; ; SM90-NEXT: $L__BB17_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_monotonic_i8_generic( +define i8 @monotonic_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -842,9 +824,8 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -855,7 +836,7 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -863,7 +844,7 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB18_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -873,14 +854,15 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB18_1; ; SM90-NEXT: $L__BB18_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic acquire ret i8 %new } -define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_monotonic_i8_global( +define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -888,9 +870,8 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -901,15 +882,15 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB19_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB19_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -919,14 +900,15 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB19_1; ; SM90-NEXT: $L__BB19_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire ret i8 %new } -define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_monotonic_i8_shared( +define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_global( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -934,9 +916,8 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -947,15 +928,15 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB20_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB20_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -965,14 +946,15 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB20_1; ; SM90-NEXT: $L__BB20_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic acquire ret i8 %new } -define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_acquire_i8_generic( +define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_global_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -980,9 +962,8 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -993,15 +974,15 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB21_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB21_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1014,12 +995,12 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire ret i8 %new } -define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_acquire_i8_global( +define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1027,9 +1008,8 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1040,7 +1020,7 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -1048,7 +1028,7 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB22_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1058,15 +1038,15 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB22_1; ; SM90-NEXT: $L__BB22_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_acquire_i8_shared( +define i8 @monotonic_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_global_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1074,9 +1054,8 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1087,15 +1066,15 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB23_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB23_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1105,15 +1084,15 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB23_1; ; SM90-NEXT: $L__BB23_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic acquire ret i8 %new } -define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_seq_cst_i8_generic( +define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_global_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1121,9 +1100,8 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1134,15 +1112,15 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB24_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB24_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1152,15 +1130,15 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB24_1; ; SM90-NEXT: $L__BB24_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire ret i8 %new } -define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_seq_cst_i8_global( +define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_shared( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1168,9 +1146,8 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1181,15 +1158,15 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB25_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB25_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1202,12 +1179,12 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic acquire ret i8 %new } -define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_seq_cst_i8_shared( +define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_shared_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1215,9 +1192,8 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1228,7 +1204,7 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -1236,7 +1212,7 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB26_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1249,12 +1225,12 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire ret i8 %new } -define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_monotonic_i8_generic( +define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_shared_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1262,9 +1238,8 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1275,15 +1250,15 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB27_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB27_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1293,15 +1268,15 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB27_1; ; SM90-NEXT: $L__BB27_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_monotonic_i8_global( +define i8 @monotonic_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1309,9 +1284,8 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1322,15 +1296,15 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB28_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB28_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1340,15 +1314,15 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB28_1; ; SM90-NEXT: $L__BB28_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic acquire ret i8 %new } -define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_monotonic_i8_shared( +define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1356,9 +1330,8 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1369,7 +1342,7 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -1377,7 +1350,7 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB29_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1387,15 +1360,15 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB29_1; ; SM90-NEXT: $L__BB29_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire ret i8 %new } -define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_acquire_i8_generic( +define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_generic( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1403,9 +1376,9 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_param_0]; +; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1416,7 +1389,7 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -1424,7 +1397,7 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB30_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1437,12 +1410,12 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic seq_cst ret i8 %new } -define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_acquire_i8_global( +define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_generic_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1450,9 +1423,9 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1463,15 +1436,15 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB31_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB31_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1484,12 +1457,12 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new } -define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_acquire_i8_shared( +define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_generic_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1497,9 +1470,9 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1510,15 +1483,15 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB32_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB32_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1528,15 +1501,15 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB32_1; ; SM90-NEXT: $L__BB32_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_seq_cst_i8_generic( +define i8 @monotonic_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1544,9 +1517,9 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1557,7 +1530,7 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -1565,7 +1538,7 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB33_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1575,15 +1548,15 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB33_1; ; SM90-NEXT: $L__BB33_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_seq_cst_i8_global( +define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1591,9 +1564,9 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1604,15 +1577,15 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB34_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB34_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1622,15 +1595,15 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB34_1; ; SM90-NEXT: $L__BB34_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_seq_cst_i8_shared( +define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_global( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1638,8 +1611,8 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1651,15 +1624,15 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB35_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB35_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1672,12 +1645,12 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic seq_cst ret i8 %new } -define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_monotonic_i8_generic( +define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_global_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1685,8 +1658,8 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1698,15 +1671,15 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB36_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB36_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1719,12 +1692,12 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new } -define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_monotonic_i8_global( +define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1732,9 +1705,9 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1745,7 +1718,7 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -1753,7 +1726,7 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB37_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1763,15 +1736,15 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB37_1; ; SM90-NEXT: $L__BB37_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new } -define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_monotonic_i8_shared( +define i8 @monotonic_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_global_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1779,9 +1752,9 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1792,15 +1765,15 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB38_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB38_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1810,15 +1783,15 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB38_1; ; SM90-NEXT: $L__BB38_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic seq_cst ret i8 %new } -define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_acquire_i8_generic( +define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_global_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1826,9 +1799,9 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1839,15 +1812,15 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB39_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB39_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1857,15 +1830,15 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB39_1; ; SM90-NEXT: $L__BB39_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst ret i8 %new } -define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_acquire_i8_global( +define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_shared( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1873,8 +1846,8 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1886,15 +1859,15 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB40_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB40_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1907,12 +1880,12 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic seq_cst ret i8 %new } -define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_acquire_i8_shared( +define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_shared_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1920,8 +1893,8 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1933,7 +1906,7 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -1941,7 +1914,7 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB41_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1954,12 +1927,12 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new } -define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_seq_cst_i8_generic( +define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_shared_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1967,9 +1940,9 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1980,15 +1953,15 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB42_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB42_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1998,15 +1971,15 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB42_1; ; SM90-NEXT: $L__BB42_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new } -define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_seq_cst_i8_global( +define i8 @monotonic_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -2014,9 +1987,9 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2027,15 +2000,15 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB43_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB43_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -2045,15 +2018,15 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB43_1; ; SM90-NEXT: $L__BB43_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic seq_cst ret i8 %new } -define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_seq_cst_i8_shared( +define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -2061,9 +2034,9 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2074,7 +2047,7 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -2082,7 +2055,7 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB44_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -2092,3589 +2065,26320 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB44_1; ; SM90-NEXT: $L__BB44_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst ret i8 %new } -define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_monotonic_i16_generic( +define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_generic( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB45_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB45_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB45_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB45_1; ; SM90-NEXT: $L__BB45_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire monotonic + ret i8 %new } -define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_monotonic_i16_global( +define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_generic_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB46_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB46_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB46_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB46_1; ; SM90-NEXT: $L__BB46_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic + ret i8 %new } -define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_monotonic_i16_shared( +define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_generic_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB47_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB47_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB47_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB47_1; ; SM90-NEXT: $L__BB47_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic + ret i8 %new } -define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_acquire_i16_generic( +define i8 @acquire_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB48_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB48_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB48_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB48_1; ; SM90-NEXT: $L__BB48_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic acquire - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acquire monotonic + ret i8 %new } -define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_acquire_i16_global( +define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB49_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB49_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB49_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB49_1; ; SM90-NEXT: $L__BB49_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic acquire - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic + ret i8 %new } -define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_acquire_i16_shared( +define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_global( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB50_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB50_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB50_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB50_1; ; SM90-NEXT: $L__BB50_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire monotonic + ret i8 %new } -define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_seq_cst_i16_generic( +define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_global_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_param_1]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB51_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB51_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB51_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB51_1; ; SM90-NEXT: $L__BB51_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic + ret i8 %new } -define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_seq_cst_i16_global( +define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_param_1]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB52_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB52_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB52_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB52_1; ; SM90-NEXT: $L__BB52_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic + ret i8 %new } -define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_seq_cst_i16_shared( +define i8 @acquire_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_global_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_param_1]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB53_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB53_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB53_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB53_1; ; SM90-NEXT: $L__BB53_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire monotonic + ret i8 %new } -define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_monotonic_i16_generic( +define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_global_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB54_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB54_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB54_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB54_1; ; SM90-NEXT: $L__BB54_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire monotonic - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic + ret i8 %new } -define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_monotonic_i16_global( +define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_shared( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB55_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB55_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB55_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB55_1; ; SM90-NEXT: $L__BB55_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire monotonic - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire monotonic + ret i8 %new } -define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_monotonic_i16_shared( +define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_shared_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB56_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB56_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB56_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB56_1; ; SM90-NEXT: $L__BB56_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire monotonic - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic + ret i8 %new } -define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_acquire_i16_generic( +define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_shared_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB57_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB57_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB57_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB57_1; ; SM90-NEXT: $L__BB57_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic + ret i8 %new } -define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_acquire_i16_global( +define i8 @acquire_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB58_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB58_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB58_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB58_1; ; SM90-NEXT: $L__BB58_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire monotonic + ret i8 %new } -define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_acquire_i16_shared( +define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB59_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB59_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB59_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB59_1; ; SM90-NEXT: $L__BB59_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic + ret i8 %new } -define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_seq_cst_i16_generic( +define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_generic( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_param_1]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB60_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB60_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB60_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB60_1; ; SM90-NEXT: $L__BB60_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire seq_cst - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire + ret i8 %new } -define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_seq_cst_i16_global( +define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_generic_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_param_1]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB61_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB61_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB61_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB61_1; ; SM90-NEXT: $L__BB61_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire seq_cst - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire acquire + ret i8 %new } -define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_seq_cst_i16_shared( +define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_generic_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_param_1]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB62_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB62_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB62_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB62_1; ; SM90-NEXT: $L__BB62_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire seq_cst - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire + ret i8 %new } -define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_monotonic_i16_generic( +define i8 @acquire_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_param_1]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB63_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB63_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB63_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB63_1; ; SM90-NEXT: $L__BB63_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acquire acquire + ret i8 %new } -define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_monotonic_i16_global( +define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_param_1]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB64_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB64_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB64_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB64_1; ; SM90-NEXT: $L__BB64_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire + ret i8 %new } -define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_monotonic_i16_shared( +define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_global( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_param_1]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB65_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB65_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB65_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB65_1; ; SM90-NEXT: $L__BB65_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release monotonic - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire acquire + ret i8 %new } -define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_acquire_i16_generic( +define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_global_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_param_1]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB66_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB66_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB66_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB66_1; ; SM90-NEXT: $L__BB66_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire + ret i8 %new } -define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_acquire_i16_global( +define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_param_1]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB67_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB67_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB67_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB67_1; ; SM90-NEXT: $L__BB67_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire + ret i8 %new } -define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_acquire_i16_shared( +define i8 @acquire_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_global_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_param_1]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB68_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB68_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB68_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB68_1; ; SM90-NEXT: $L__BB68_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire acquire + ret i8 %new } -define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_seq_cst_i16_generic( +define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_global_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_param_1]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB69_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB69_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB69_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB69_1; ; SM90-NEXT: $L__BB69_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire + ret i8 %new } -define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_seq_cst_i16_global( +define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_shared( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_param_1]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB70_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB70_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB70_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB70_1; ; SM90-NEXT: $L__BB70_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire acquire + ret i8 %new } -define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_seq_cst_i16_shared( +define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_shared_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_param_1]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB71_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB71_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB71_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB71_1; ; SM90-NEXT: $L__BB71_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire + ret i8 %new } -define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_monotonic_i16_generic( +define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_shared_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_param_1]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB72_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB72_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB72_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB72_1; ; SM90-NEXT: $L__BB72_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel monotonic - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire + ret i8 %new } -define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_monotonic_i16_global( +define i8 @acquire_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_param_1]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB73_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB73_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB73_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB73_1; ; SM90-NEXT: $L__BB73_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel monotonic - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire acquire + ret i8 %new } -define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_monotonic_i16_shared( +define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_param_1]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB74_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB74_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB74_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB74_1; ; SM90-NEXT: $L__BB74_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel monotonic - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire + ret i8 %new } -define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_acquire_i16_generic( +define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_generic( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_param_1]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_param_0]; +; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB75_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB75_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB75_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB75_1; ; SM90-NEXT: $L__BB75_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel acquire - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire seq_cst + ret i8 %new } -define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_acquire_i16_global( +define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_generic_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_param_1]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB76_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB76_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB76_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB76_1; ; SM90-NEXT: $L__BB76_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel acquire - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst + ret i8 %new } -define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_acquire_i16_shared( +define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_generic_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_param_1]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB77_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB77_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB77_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB77_1; ; SM90-NEXT: $L__BB77_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel acquire - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst + ret i8 %new } -define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_seq_cst_i16_generic( +define i8 @acquire_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_param_1]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB78_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB78_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB78_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB78_1; ; SM90-NEXT: $L__BB78_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel seq_cst - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acquire seq_cst + ret i8 %new } -define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_seq_cst_i16_global( +define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_param_1]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB79_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB79_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB79_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB79_1; ; SM90-NEXT: $L__BB79_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel seq_cst - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst + ret i8 %new } -define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_seq_cst_i16_shared( +define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_global( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB80_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB80_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB80_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB80_1; ; SM90-NEXT: $L__BB80_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire seq_cst + ret i8 %new } -define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_monotonic_i16_generic( +define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_global_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB81_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB81_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB81_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB81_1; ; SM90-NEXT: $L__BB81_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst monotonic - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst + ret i8 %new } -define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_monotonic_i16_global( +define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_param_1]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB82_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB82_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB82_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB82_1; ; SM90-NEXT: $L__BB82_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst monotonic - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst + ret i8 %new } -define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_monotonic_i16_shared( +define i8 @acquire_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_global_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_param_1]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB83_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB83_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB83_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB83_1; ; SM90-NEXT: $L__BB83_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst monotonic - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire seq_cst + ret i8 %new } -define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_acquire_i16_generic( +define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_global_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_param_1]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB84_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB84_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB84_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB84_1; ; SM90-NEXT: $L__BB84_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst + ret i8 %new } -define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_acquire_i16_global( +define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_shared( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB85_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB85_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB85_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB85_1; ; SM90-NEXT: $L__BB85_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst acquire - ret i16 %new -} + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire seq_cst + ret i8 %new +} -define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_acquire_i16_shared( +define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_shared_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB86_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB86_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB86_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB86_1; ; SM90-NEXT: $L__BB86_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst + ret i8 %new } -define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_seq_cst_i16_generic( +define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_shared_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_param_1]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB87_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB87_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB87_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB87_1; ; SM90-NEXT: $L__BB87_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst + ret i8 %new } -define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_seq_cst_i16_global( +define i8 @acquire_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_param_1]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB88_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB88_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB88_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB88_1; ; SM90-NEXT: $L__BB88_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire seq_cst + ret i8 %new } -define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_seq_cst_i16_shared( +define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_param_1]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB89_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB89_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB89_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB89_1; ; SM90-NEXT: $L__BB89_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst + ret i8 %new +} + +define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB90_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB90_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB90_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB90_1; +; SM90-NEXT: $L__BB90_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release monotonic + ret i8 %new +} + +define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB91_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB91_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB91_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB91_1; +; SM90-NEXT: $L__BB91_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release monotonic + ret i8 %new +} + +define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB92_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB92_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB92_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB92_1; +; SM90-NEXT: $L__BB92_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release monotonic + ret i8 %new +} + +define i8 @release_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB93_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB93_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB93_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB93_1; +; SM90-NEXT: $L__BB93_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") release monotonic + ret i8 %new +} + +define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB94_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB94_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB94_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB94_1; +; SM90-NEXT: $L__BB94_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release monotonic + ret i8 %new +} + +define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB95_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB95_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB95_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB95_1; +; SM90-NEXT: $L__BB95_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release monotonic + ret i8 %new +} + +define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB96_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB96_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB96_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB96_1; +; SM90-NEXT: $L__BB96_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release monotonic + ret i8 %new +} + +define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB97_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB97_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB97_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB97_1; +; SM90-NEXT: $L__BB97_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic + ret i8 %new +} + +define i8 @release_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB98_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB98_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB98_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB98_1; +; SM90-NEXT: $L__BB98_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") release monotonic + ret i8 %new +} + +define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB99_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB99_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB99_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB99_1; +; SM90-NEXT: $L__BB99_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic + ret i8 %new +} + +define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB100_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB100_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB100_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB100_1; +; SM90-NEXT: $L__BB100_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release monotonic + ret i8 %new +} + +define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB101_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB101_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB101_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB101_1; +; SM90-NEXT: $L__BB101_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release monotonic + ret i8 %new +} + +define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB102_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB102_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB102_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB102_1; +; SM90-NEXT: $L__BB102_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic + ret i8 %new +} + +define i8 @release_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB103_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB103_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB103_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB103_1; +; SM90-NEXT: $L__BB103_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") release monotonic + ret i8 %new +} + +define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB104_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB104_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB104_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB104_1; +; SM90-NEXT: $L__BB104_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic + ret i8 %new +} + +define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB105_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB105_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB105_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB105_1; +; SM90-NEXT: $L__BB105_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release acquire + ret i8 %new +} + +define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB106_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB106_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB106_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB106_1; +; SM90-NEXT: $L__BB106_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release acquire + ret i8 %new +} + +define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB107_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB107_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB107_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB107_1; +; SM90-NEXT: $L__BB107_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release acquire + ret i8 %new +} + +define i8 @release_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB108_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB108_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB108_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB108_1; +; SM90-NEXT: $L__BB108_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") release acquire + ret i8 %new +} + +define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB109_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB109_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB109_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB109_1; +; SM90-NEXT: $L__BB109_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release acquire + ret i8 %new +} + +define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB110_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB110_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB110_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB110_1; +; SM90-NEXT: $L__BB110_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release acquire + ret i8 %new +} + +define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB111_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB111_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB111_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB111_1; +; SM90-NEXT: $L__BB111_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release acquire + ret i8 %new +} + +define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB112_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB112_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB112_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB112_1; +; SM90-NEXT: $L__BB112_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire + ret i8 %new +} + +define i8 @release_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB113_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB113_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB113_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB113_1; +; SM90-NEXT: $L__BB113_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") release acquire + ret i8 %new +} + +define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB114_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB114_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB114_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB114_1; +; SM90-NEXT: $L__BB114_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release acquire + ret i8 %new +} + +define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB115_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB115_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB115_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB115_1; +; SM90-NEXT: $L__BB115_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release acquire + ret i8 %new +} + +define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB116_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB116_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB116_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB116_1; +; SM90-NEXT: $L__BB116_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release acquire + ret i8 %new +} + +define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB117_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB117_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB117_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB117_1; +; SM90-NEXT: $L__BB117_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release acquire + ret i8 %new +} + +define i8 @release_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB118_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB118_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB118_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB118_1; +; SM90-NEXT: $L__BB118_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") release acquire + ret i8 %new +} + +define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB119_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB119_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB119_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB119_1; +; SM90-NEXT: $L__BB119_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release acquire + ret i8 %new +} + +define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB120_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB120_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB120_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB120_1; +; SM90-NEXT: $L__BB120_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB121_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB121_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB121_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB121_1; +; SM90-NEXT: $L__BB121_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB122_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB122_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB122_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB122_1; +; SM90-NEXT: $L__BB122_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB123_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB123_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB123_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB123_1; +; SM90-NEXT: $L__BB123_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB124_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB124_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB124_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB124_1; +; SM90-NEXT: $L__BB124_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB125_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB125_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB125_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB125_1; +; SM90-NEXT: $L__BB125_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB126_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB126_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB126_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB126_1; +; SM90-NEXT: $L__BB126_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB127_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB127_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB127_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB127_1; +; SM90-NEXT: $L__BB127_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB128_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB128_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB128_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB128_1; +; SM90-NEXT: $L__BB128_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB129_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB129_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB129_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB129_1; +; SM90-NEXT: $L__BB129_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB130_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB130_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB130_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB130_1; +; SM90-NEXT: $L__BB130_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB131_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB131_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB131_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB131_1; +; SM90-NEXT: $L__BB131_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB132_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB132_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB132_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB132_1; +; SM90-NEXT: $L__BB132_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB133_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB133_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB133_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB133_1; +; SM90-NEXT: $L__BB133_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB134_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB134_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB134_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB134_1; +; SM90-NEXT: $L__BB134_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB135_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB135_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB135_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB135_1; +; SM90-NEXT: $L__BB135_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB136_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB136_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB136_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB136_1; +; SM90-NEXT: $L__BB136_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB137_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB137_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB137_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB137_1; +; SM90-NEXT: $L__BB137_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB138_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB138_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB138_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB138_1; +; SM90-NEXT: $L__BB138_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB139_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB139_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB139_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB139_1; +; SM90-NEXT: $L__BB139_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB140_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB140_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB140_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB140_1; +; SM90-NEXT: $L__BB140_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB141_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB141_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB141_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB141_1; +; SM90-NEXT: $L__BB141_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB142_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB142_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB142_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB142_1; +; SM90-NEXT: $L__BB142_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB143_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB143_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB143_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB143_1; +; SM90-NEXT: $L__BB143_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB144_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB144_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB144_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB144_1; +; SM90-NEXT: $L__BB144_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB145_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB145_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB145_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB145_1; +; SM90-NEXT: $L__BB145_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB146_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB146_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB146_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB146_1; +; SM90-NEXT: $L__BB146_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB147_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB147_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB147_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB147_1; +; SM90-NEXT: $L__BB147_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB148_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB148_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB148_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB148_1; +; SM90-NEXT: $L__BB148_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB149_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB149_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB149_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB149_1; +; SM90-NEXT: $L__BB149_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB150_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB150_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB150_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB150_1; +; SM90-NEXT: $L__BB150_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB151_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB151_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB151_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB151_1; +; SM90-NEXT: $L__BB151_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB152_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB152_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB152_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB152_1; +; SM90-NEXT: $L__BB152_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB153_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB153_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB153_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB153_1; +; SM90-NEXT: $L__BB153_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB154_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB154_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB154_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB154_1; +; SM90-NEXT: $L__BB154_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB155_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB155_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB155_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB155_1; +; SM90-NEXT: $L__BB155_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB156_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB156_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB156_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB156_1; +; SM90-NEXT: $L__BB156_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB157_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB157_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB157_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB157_1; +; SM90-NEXT: $L__BB157_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB158_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB158_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB158_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB158_1; +; SM90-NEXT: $L__BB158_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB159_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB159_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB159_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB159_1; +; SM90-NEXT: $L__BB159_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB160_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB160_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB160_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB160_1; +; SM90-NEXT: $L__BB160_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB161_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB161_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB161_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB161_1; +; SM90-NEXT: $L__BB161_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB162_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB162_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB162_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB162_1; +; SM90-NEXT: $L__BB162_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB163_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB163_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB163_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB163_1; +; SM90-NEXT: $L__BB163_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB164_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB164_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB164_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB164_1; +; SM90-NEXT: $L__BB164_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB165_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB165_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB165_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB165_1; +; SM90-NEXT: $L__BB165_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB166_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB166_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB166_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB166_1; +; SM90-NEXT: $L__BB166_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB167_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB167_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB167_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB167_1; +; SM90-NEXT: $L__BB167_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB168_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB168_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB168_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB168_1; +; SM90-NEXT: $L__BB168_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB169_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB169_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB169_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB169_1; +; SM90-NEXT: $L__BB169_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB170_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB170_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB170_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB170_1; +; SM90-NEXT: $L__BB170_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB171_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB171_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB171_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB171_1; +; SM90-NEXT: $L__BB171_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB172_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB172_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB172_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB172_1; +; SM90-NEXT: $L__BB172_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB173_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB173_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB173_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB173_1; +; SM90-NEXT: $L__BB173_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB174_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB174_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB174_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB174_1; +; SM90-NEXT: $L__BB174_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB175_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB175_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB175_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB175_1; +; SM90-NEXT: $L__BB175_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB176_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB176_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB176_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB176_1; +; SM90-NEXT: $L__BB176_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB177_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB177_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB177_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB177_1; +; SM90-NEXT: $L__BB177_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB178_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB178_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB178_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB178_1; +; SM90-NEXT: $L__BB178_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB179_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB179_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB179_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB179_1; +; SM90-NEXT: $L__BB179_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB180_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB180_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB180_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB180_1; +; SM90-NEXT: $L__BB180_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB181_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB181_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB181_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB181_1; +; SM90-NEXT: $L__BB181_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB182_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB182_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB182_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB182_1; +; SM90-NEXT: $L__BB182_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB183_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB183_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB183_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB183_1; +; SM90-NEXT: $L__BB183_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB184_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB184_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB184_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB184_1; +; SM90-NEXT: $L__BB184_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB185_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB185_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB185_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB185_1; +; SM90-NEXT: $L__BB185_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB186_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB186_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB186_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB186_1; +; SM90-NEXT: $L__BB186_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB187_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB187_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB187_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB187_1; +; SM90-NEXT: $L__BB187_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB188_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB188_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB188_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB188_1; +; SM90-NEXT: $L__BB188_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB189_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB189_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB189_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB189_1; +; SM90-NEXT: $L__BB189_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB190_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB190_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB190_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB190_1; +; SM90-NEXT: $L__BB190_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB191_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB191_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB191_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB191_1; +; SM90-NEXT: $L__BB191_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB192_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB192_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB192_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB192_1; +; SM90-NEXT: $L__BB192_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB193_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB193_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB193_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB193_1; +; SM90-NEXT: $L__BB193_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB194_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB194_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB194_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB194_1; +; SM90-NEXT: $L__BB194_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB195_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB195_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB195_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB195_1; +; SM90-NEXT: $L__BB195_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB196_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB196_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB196_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB196_1; +; SM90-NEXT: $L__BB196_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB197_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB197_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB197_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB197_1; +; SM90-NEXT: $L__BB197_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB198_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB198_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB198_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB198_1; +; SM90-NEXT: $L__BB198_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB199_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB199_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB199_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB199_1; +; SM90-NEXT: $L__BB199_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB200_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB200_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB200_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB200_1; +; SM90-NEXT: $L__BB200_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB201_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB201_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB201_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB201_1; +; SM90-NEXT: $L__BB201_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB202_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB202_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB202_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB202_1; +; SM90-NEXT: $L__BB202_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB203_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB203_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB203_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB203_1; +; SM90-NEXT: $L__BB203_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB204_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB204_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB204_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB204_1; +; SM90-NEXT: $L__BB204_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB205_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB205_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB205_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB205_1; +; SM90-NEXT: $L__BB205_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB206_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB206_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB206_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB206_1; +; SM90-NEXT: $L__BB206_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB207_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB207_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB207_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB207_1; +; SM90-NEXT: $L__BB207_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB208_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB208_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB208_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB208_1; +; SM90-NEXT: $L__BB208_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB209_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB209_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB209_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB209_1; +; SM90-NEXT: $L__BB209_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB210_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB210_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB210_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB210_1; +; SM90-NEXT: $L__BB210_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB211_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB211_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB211_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB211_1; +; SM90-NEXT: $L__BB211_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB212_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB212_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB212_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB212_1; +; SM90-NEXT: $L__BB212_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB213_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB213_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB213_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB213_1; +; SM90-NEXT: $L__BB213_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB214_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB214_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB214_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB214_1; +; SM90-NEXT: $L__BB214_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB215_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB215_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB215_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB215_1; +; SM90-NEXT: $L__BB215_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB216_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB216_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB216_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB216_1; +; SM90-NEXT: $L__BB216_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB217_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB217_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB217_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB217_1; +; SM90-NEXT: $L__BB217_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB218_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB218_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB218_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB218_1; +; SM90-NEXT: $L__BB218_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB219_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB219_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB219_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB219_1; +; SM90-NEXT: $L__BB219_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB220_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB220_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB220_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB220_1; +; SM90-NEXT: $L__BB220_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB221_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB221_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB221_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB221_1; +; SM90-NEXT: $L__BB221_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB222_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB222_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB222_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB222_1; +; SM90-NEXT: $L__BB222_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB223_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB223_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB223_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB223_1; +; SM90-NEXT: $L__BB223_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB224_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB224_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB224_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB224_1; +; SM90-NEXT: $L__BB224_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst + ret i8 %new +} + +define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_monotonic_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB225_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB225_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB225_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB225_1; +; SM90-NEXT: $L__BB225_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_monotonic_i16_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_sys_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_sys_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB226_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB226_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB226_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB226_1; +; SM90-NEXT: $L__BB226_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_monotonic_i16_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_cta_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB227_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB227_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB227_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB227_1; +; SM90-NEXT: $L__BB227_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_monotonic_i16_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_cluster_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_cluster_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB228_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB228_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB228_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB228_1; +; SM90-NEXT: $L__BB228_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_monotonic_i16_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_gpu_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_gpu_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB229_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB229_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB229_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB229_1; +; SM90-NEXT: $L__BB229_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_monotonic_i16_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB230_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB230_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB230_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB230_1; +; SM90-NEXT: $L__BB230_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_monotonic_i16_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_sys_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_sys_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB231_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB231_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB231_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB231_1; +; SM90-NEXT: $L__BB231_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_monotonic_i16_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_cta_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB232_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB232_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB232_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB232_1; +; SM90-NEXT: $L__BB232_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_monotonic_i16_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_cluster_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_cluster_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB233_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB233_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB233_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB233_1; +; SM90-NEXT: $L__BB233_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_monotonic_i16_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_gpu_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_gpu_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB234_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB234_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB234_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB234_1; +; SM90-NEXT: $L__BB234_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_monotonic_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB235_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB235_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB235_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB235_1; +; SM90-NEXT: $L__BB235_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_monotonic_i16_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_sys_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_sys_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB236_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB236_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB236_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB236_1; +; SM90-NEXT: $L__BB236_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_monotonic_i16_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_cta_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB237_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB237_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB237_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB237_1; +; SM90-NEXT: $L__BB237_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_monotonic_i16_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_cluster_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_cluster_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB238_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB238_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB238_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB238_1; +; SM90-NEXT: $L__BB238_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_monotonic_i16_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_gpu_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_gpu_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB239_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB239_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB239_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB239_1; +; SM90-NEXT: $L__BB239_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_acquire_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB240_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB240_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB240_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB240_1; +; SM90-NEXT: $L__BB240_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_acquire_i16_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_sys_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_sys_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB241_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB241_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB241_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB241_1; +; SM90-NEXT: $L__BB241_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_acquire_i16_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_cta_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB242_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB242_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB242_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB242_1; +; SM90-NEXT: $L__BB242_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_acquire_i16_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_cluster_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_cluster_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB243_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB243_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB243_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB243_1; +; SM90-NEXT: $L__BB243_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_acquire_i16_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_gpu_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_gpu_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB244_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB244_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB244_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB244_1; +; SM90-NEXT: $L__BB244_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_acquire_i16_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB245_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB245_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB245_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB245_1; +; SM90-NEXT: $L__BB245_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_acquire_i16_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_sys_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_sys_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB246_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB246_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB246_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB246_1; +; SM90-NEXT: $L__BB246_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_acquire_i16_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_cta_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB247_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB247_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB247_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB247_1; +; SM90-NEXT: $L__BB247_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_acquire_i16_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_cluster_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_cluster_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB248_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB248_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB248_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB248_1; +; SM90-NEXT: $L__BB248_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_acquire_i16_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_gpu_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_gpu_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB249_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB249_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB249_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB249_1; +; SM90-NEXT: $L__BB249_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_acquire_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB250_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB250_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB250_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB250_1; +; SM90-NEXT: $L__BB250_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_acquire_i16_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_sys_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_sys_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB251_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB251_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB251_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB251_1; +; SM90-NEXT: $L__BB251_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_acquire_i16_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_cta_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB252_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB252_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB252_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB252_1; +; SM90-NEXT: $L__BB252_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_acquire_i16_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_cluster_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_cluster_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB253_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB253_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB253_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB253_1; +; SM90-NEXT: $L__BB253_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_acquire_i16_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_gpu_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_gpu_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB254_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB254_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB254_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB254_1; +; SM90-NEXT: $L__BB254_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_seq_cst_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB255_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB255_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB255_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB255_1; +; SM90-NEXT: $L__BB255_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_seq_cst_i16_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB256_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB256_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB256_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB256_1; +; SM90-NEXT: $L__BB256_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_seq_cst_i16_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB257_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB257_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB257_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB257_1; +; SM90-NEXT: $L__BB257_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_seq_cst_i16_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB258_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB258_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB258_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB258_1; +; SM90-NEXT: $L__BB258_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_seq_cst_i16_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB259_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB259_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB259_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB259_1; +; SM90-NEXT: $L__BB259_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_seq_cst_i16_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB260_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB260_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB260_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB260_1; +; SM90-NEXT: $L__BB260_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_seq_cst_i16_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB261_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB261_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB261_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB261_1; +; SM90-NEXT: $L__BB261_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_seq_cst_i16_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB262_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB262_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB262_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB262_1; +; SM90-NEXT: $L__BB262_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_seq_cst_i16_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB263_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB263_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB263_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB263_1; +; SM90-NEXT: $L__BB263_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_seq_cst_i16_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB264_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB264_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB264_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB264_1; +; SM90-NEXT: $L__BB264_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_seq_cst_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB265_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB265_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB265_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB265_1; +; SM90-NEXT: $L__BB265_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_seq_cst_i16_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB266_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB266_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB266_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB266_1; +; SM90-NEXT: $L__BB266_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_seq_cst_i16_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB267_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB267_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB267_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB267_1; +; SM90-NEXT: $L__BB267_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_seq_cst_i16_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB268_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB268_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB268_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB268_1; +; SM90-NEXT: $L__BB268_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_seq_cst_i16_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB269_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB269_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB269_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB269_1; +; SM90-NEXT: $L__BB269_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst + ret i16 %new +} + +define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_monotonic_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB270_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB270_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB270_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB270_1; +; SM90-NEXT: $L__BB270_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_monotonic_i16_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_sys_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_sys_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB271_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB271_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB271_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB271_1; +; SM90-NEXT: $L__BB271_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_monotonic_i16_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_cta_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB272_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB272_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB272_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB272_1; +; SM90-NEXT: $L__BB272_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_monotonic_i16_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_cluster_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_cluster_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB273_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB273_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB273_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB273_1; +; SM90-NEXT: $L__BB273_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_monotonic_i16_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_gpu_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_gpu_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB274_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB274_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB274_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB274_1; +; SM90-NEXT: $L__BB274_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_monotonic_i16_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB275_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB275_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB275_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB275_1; +; SM90-NEXT: $L__BB275_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_monotonic_i16_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_sys_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_sys_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB276_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB276_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB276_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB276_1; +; SM90-NEXT: $L__BB276_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_monotonic_i16_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_cta_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB277_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB277_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB277_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB277_1; +; SM90-NEXT: $L__BB277_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_monotonic_i16_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_cluster_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_cluster_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB278_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB278_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB278_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB278_1; +; SM90-NEXT: $L__BB278_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_monotonic_i16_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_gpu_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_gpu_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB279_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB279_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB279_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB279_1; +; SM90-NEXT: $L__BB279_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_monotonic_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB280_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB280_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB280_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB280_1; +; SM90-NEXT: $L__BB280_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_monotonic_i16_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_sys_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_sys_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB281_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB281_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB281_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB281_1; +; SM90-NEXT: $L__BB281_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_monotonic_i16_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_cta_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB282_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB282_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB282_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB282_1; +; SM90-NEXT: $L__BB282_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_monotonic_i16_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_cluster_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_cluster_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB283_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB283_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB283_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB283_1; +; SM90-NEXT: $L__BB283_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_monotonic_i16_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_gpu_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_gpu_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB284_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB284_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB284_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB284_1; +; SM90-NEXT: $L__BB284_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic + ret i16 %new +} + +define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_acquire_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB285_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB285_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB285_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB285_1; +; SM90-NEXT: $L__BB285_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_acquire_i16_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_sys_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_sys_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB286_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB286_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB286_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB286_1; +; SM90-NEXT: $L__BB286_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_acquire_i16_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_cta_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB287_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB287_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB287_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB287_1; +; SM90-NEXT: $L__BB287_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_acquire_i16_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_cluster_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_cluster_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB288_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB288_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB288_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB288_1; +; SM90-NEXT: $L__BB288_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_acquire_i16_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_gpu_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_gpu_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB289_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB289_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB289_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB289_1; +; SM90-NEXT: $L__BB289_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_acquire_i16_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB290_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB290_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB290_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB290_1; +; SM90-NEXT: $L__BB290_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_acquire_i16_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_sys_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_sys_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB291_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB291_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB291_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB291_1; +; SM90-NEXT: $L__BB291_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_acquire_i16_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_cta_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB292_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB292_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB292_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB292_1; +; SM90-NEXT: $L__BB292_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_acquire_i16_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_cluster_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_cluster_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB293_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB293_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB293_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB293_1; +; SM90-NEXT: $L__BB293_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_acquire_i16_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_gpu_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_gpu_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB294_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB294_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB294_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB294_1; +; SM90-NEXT: $L__BB294_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_acquire_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB295_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB295_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB295_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB295_1; +; SM90-NEXT: $L__BB295_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_acquire_i16_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_sys_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_sys_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB296_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB296_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB296_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB296_1; +; SM90-NEXT: $L__BB296_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_acquire_i16_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_cta_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB297_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB297_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB297_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB297_1; +; SM90-NEXT: $L__BB297_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_acquire_i16_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_cluster_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_cluster_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB298_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB298_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB298_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB298_1; +; SM90-NEXT: $L__BB298_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_acquire_i16_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_gpu_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_gpu_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB299_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB299_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB299_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB299_1; +; SM90-NEXT: $L__BB299_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_seq_cst_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB300_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB300_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB300_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB300_1; +; SM90-NEXT: $L__BB300_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_seq_cst_i16_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB301_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB301_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB301_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB301_1; +; SM90-NEXT: $L__BB301_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_seq_cst_i16_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB302_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB302_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB302_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB302_1; +; SM90-NEXT: $L__BB302_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_seq_cst_i16_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB303_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB303_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB303_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB303_1; +; SM90-NEXT: $L__BB303_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_seq_cst_i16_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB304_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB304_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB304_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB304_1; +; SM90-NEXT: $L__BB304_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_seq_cst_i16_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB305_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB305_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB305_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB305_1; +; SM90-NEXT: $L__BB305_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_seq_cst_i16_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB306_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB306_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB306_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB306_1; +; SM90-NEXT: $L__BB306_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_seq_cst_i16_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB307_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB307_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB307_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB307_1; +; SM90-NEXT: $L__BB307_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_seq_cst_i16_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB308_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB308_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB308_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB308_1; +; SM90-NEXT: $L__BB308_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_seq_cst_i16_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB309_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB309_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB309_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB309_1; +; SM90-NEXT: $L__BB309_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_seq_cst_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB310_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB310_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB310_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB310_1; +; SM90-NEXT: $L__BB310_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_seq_cst_i16_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB311_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB311_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB311_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB311_1; +; SM90-NEXT: $L__BB311_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_seq_cst_i16_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB312_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB312_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB312_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB312_1; +; SM90-NEXT: $L__BB312_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_seq_cst_i16_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB313_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB313_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB313_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB313_1; +; SM90-NEXT: $L__BB313_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_seq_cst_i16_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB314_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB314_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB314_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB314_1; +; SM90-NEXT: $L__BB314_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst + ret i16 %new +} + +define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_monotonic_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB315_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB315_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB315_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB315_1; +; SM90-NEXT: $L__BB315_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_monotonic_i16_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB316_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB316_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB316_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB316_1; +; SM90-NEXT: $L__BB316_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_monotonic_i16_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB317_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB317_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB317_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB317_1; +; SM90-NEXT: $L__BB317_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_monotonic_i16_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB318_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB318_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB318_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB318_1; +; SM90-NEXT: $L__BB318_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_monotonic_i16_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB319_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB319_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB319_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB319_1; +; SM90-NEXT: $L__BB319_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_monotonic_i16_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB320_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB320_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB320_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB320_1; +; SM90-NEXT: $L__BB320_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_monotonic_i16_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB321_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB321_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB321_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB321_1; +; SM90-NEXT: $L__BB321_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_monotonic_i16_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB322_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB322_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB322_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB322_1; +; SM90-NEXT: $L__BB322_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_monotonic_i16_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB323_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB323_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB323_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB323_1; +; SM90-NEXT: $L__BB323_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_monotonic_i16_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB324_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB324_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB324_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB324_1; +; SM90-NEXT: $L__BB324_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_monotonic_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB325_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB325_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB325_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB325_1; +; SM90-NEXT: $L__BB325_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_monotonic_i16_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB326_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB326_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB326_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB326_1; +; SM90-NEXT: $L__BB326_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_monotonic_i16_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB327_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB327_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB327_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB327_1; +; SM90-NEXT: $L__BB327_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_monotonic_i16_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB328_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB328_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB328_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB328_1; +; SM90-NEXT: $L__BB328_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_monotonic_i16_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB329_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB329_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB329_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB329_1; +; SM90-NEXT: $L__BB329_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic + ret i16 %new +} + +define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_acquire_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB330_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB330_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB330_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB330_1; +; SM90-NEXT: $L__BB330_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_acquire_i16_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB331_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB331_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB331_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB331_1; +; SM90-NEXT: $L__BB331_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_acquire_i16_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB332_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB332_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB332_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB332_1; +; SM90-NEXT: $L__BB332_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_acquire_i16_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB333_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB333_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB333_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB333_1; +; SM90-NEXT: $L__BB333_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_acquire_i16_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB334_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB334_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB334_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB334_1; +; SM90-NEXT: $L__BB334_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_acquire_i16_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB335_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB335_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB335_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB335_1; +; SM90-NEXT: $L__BB335_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_acquire_i16_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB336_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB336_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB336_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB336_1; +; SM90-NEXT: $L__BB336_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_acquire_i16_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB337_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB337_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB337_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB337_1; +; SM90-NEXT: $L__BB337_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_acquire_i16_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB338_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB338_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB338_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB338_1; +; SM90-NEXT: $L__BB338_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_acquire_i16_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB339_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB339_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB339_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB339_1; +; SM90-NEXT: $L__BB339_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_acquire_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB340_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB340_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB340_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB340_1; +; SM90-NEXT: $L__BB340_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_acquire_i16_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB341_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB341_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB341_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB341_1; +; SM90-NEXT: $L__BB341_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_acquire_i16_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB342_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB342_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB342_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB342_1; +; SM90-NEXT: $L__BB342_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_acquire_i16_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB343_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB343_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB343_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB343_1; +; SM90-NEXT: $L__BB343_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_acquire_i16_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB344_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB344_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB344_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB344_1; +; SM90-NEXT: $L__BB344_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release acquire + ret i16 %new +} + +define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_seq_cst_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB345_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB345_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB345_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB345_1; +; SM90-NEXT: $L__BB345_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_seq_cst_i16_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB346_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB346_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB346_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB346_1; +; SM90-NEXT: $L__BB346_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_seq_cst_i16_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB347_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB347_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB347_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB347_1; +; SM90-NEXT: $L__BB347_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_seq_cst_i16_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB348_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB348_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB348_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB348_1; +; SM90-NEXT: $L__BB348_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_seq_cst_i16_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB349_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB349_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB349_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB349_1; +; SM90-NEXT: $L__BB349_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_seq_cst_i16_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB350_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB350_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB350_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB350_1; +; SM90-NEXT: $L__BB350_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_seq_cst_i16_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB351_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB351_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB351_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB351_1; +; SM90-NEXT: $L__BB351_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_seq_cst_i16_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB352_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB352_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB352_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB352_1; +; SM90-NEXT: $L__BB352_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_seq_cst_i16_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB353_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB353_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB353_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB353_1; +; SM90-NEXT: $L__BB353_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_seq_cst_i16_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB354_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB354_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB354_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB354_1; +; SM90-NEXT: $L__BB354_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_seq_cst_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB355_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB355_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB355_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB355_1; +; SM90-NEXT: $L__BB355_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_seq_cst_i16_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB356_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB356_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB356_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB356_1; +; SM90-NEXT: $L__BB356_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_seq_cst_i16_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB357_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB357_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB357_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB357_1; +; SM90-NEXT: $L__BB357_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_seq_cst_i16_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB358_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB358_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB358_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB358_1; +; SM90-NEXT: $L__BB358_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_seq_cst_i16_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB359_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB359_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB359_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB359_1; +; SM90-NEXT: $L__BB359_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_monotonic_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB360_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB360_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB360_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB360_1; +; SM90-NEXT: $L__BB360_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_monotonic_i16_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB361_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB361_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB361_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB361_1; +; SM90-NEXT: $L__BB361_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_monotonic_i16_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB362_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB362_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB362_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB362_1; +; SM90-NEXT: $L__BB362_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_monotonic_i16_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB363_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB363_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB363_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB363_1; +; SM90-NEXT: $L__BB363_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_monotonic_i16_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB364_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB364_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB364_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB364_1; +; SM90-NEXT: $L__BB364_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_monotonic_i16_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB365_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB365_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB365_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB365_1; +; SM90-NEXT: $L__BB365_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_monotonic_i16_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB366_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB366_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB366_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB366_1; +; SM90-NEXT: $L__BB366_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_monotonic_i16_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB367_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB367_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB367_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB367_1; +; SM90-NEXT: $L__BB367_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_monotonic_i16_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB368_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB368_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB368_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB368_1; +; SM90-NEXT: $L__BB368_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_monotonic_i16_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB369_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB369_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB369_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB369_1; +; SM90-NEXT: $L__BB369_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_monotonic_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB370_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB370_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB370_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB370_1; +; SM90-NEXT: $L__BB370_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_monotonic_i16_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB371_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB371_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB371_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB371_1; +; SM90-NEXT: $L__BB371_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_monotonic_i16_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB372_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB372_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB372_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB372_1; +; SM90-NEXT: $L__BB372_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_monotonic_i16_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB373_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB373_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB373_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB373_1; +; SM90-NEXT: $L__BB373_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_monotonic_i16_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB374_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB374_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB374_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB374_1; +; SM90-NEXT: $L__BB374_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_acquire_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB375_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB375_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB375_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB375_1; +; SM90-NEXT: $L__BB375_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_acquire_i16_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB376_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB376_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB376_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB376_1; +; SM90-NEXT: $L__BB376_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_acquire_i16_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB377_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB377_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB377_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB377_1; +; SM90-NEXT: $L__BB377_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_acquire_i16_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB378_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB378_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB378_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB378_1; +; SM90-NEXT: $L__BB378_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_acquire_i16_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB379_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB379_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB379_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB379_1; +; SM90-NEXT: $L__BB379_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_acquire_i16_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB380_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB380_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB380_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB380_1; +; SM90-NEXT: $L__BB380_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_acquire_i16_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB381_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB381_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB381_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB381_1; +; SM90-NEXT: $L__BB381_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_acquire_i16_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB382_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB382_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB382_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB382_1; +; SM90-NEXT: $L__BB382_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_acquire_i16_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB383_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB383_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB383_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB383_1; +; SM90-NEXT: $L__BB383_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_acquire_i16_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB384_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB384_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB384_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB384_1; +; SM90-NEXT: $L__BB384_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_acquire_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB385_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB385_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB385_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB385_1; +; SM90-NEXT: $L__BB385_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_acquire_i16_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB386_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB386_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB386_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB386_1; +; SM90-NEXT: $L__BB386_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_acquire_i16_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB387_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB387_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB387_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB387_1; +; SM90-NEXT: $L__BB387_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_acquire_i16_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB388_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB388_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB388_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB388_1; +; SM90-NEXT: $L__BB388_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_acquire_i16_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB389_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB389_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB389_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB389_1; +; SM90-NEXT: $L__BB389_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_seq_cst_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB390_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB390_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB390_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB390_1; +; SM90-NEXT: $L__BB390_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_seq_cst_i16_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB391_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB391_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB391_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB391_1; +; SM90-NEXT: $L__BB391_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_seq_cst_i16_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB392_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB392_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB392_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB392_1; +; SM90-NEXT: $L__BB392_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_seq_cst_i16_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB393_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB393_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB393_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB393_1; +; SM90-NEXT: $L__BB393_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_seq_cst_i16_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB394_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB394_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB394_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB394_1; +; SM90-NEXT: $L__BB394_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_seq_cst_i16_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB395_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB395_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB395_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB395_1; +; SM90-NEXT: $L__BB395_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_seq_cst_i16_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB396_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB396_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB396_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB396_1; +; SM90-NEXT: $L__BB396_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_seq_cst_i16_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB397_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB397_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB397_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB397_1; +; SM90-NEXT: $L__BB397_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_seq_cst_i16_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB398_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB398_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB398_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB398_1; +; SM90-NEXT: $L__BB398_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_seq_cst_i16_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB399_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB399_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB399_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB399_1; +; SM90-NEXT: $L__BB399_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_seq_cst_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB400_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB400_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB400_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB400_1; +; SM90-NEXT: $L__BB400_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_seq_cst_i16_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB401_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB401_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB401_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB401_1; +; SM90-NEXT: $L__BB401_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_seq_cst_i16_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB402_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB402_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB402_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB402_1; +; SM90-NEXT: $L__BB402_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_seq_cst_i16_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB403_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB403_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB403_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB403_1; +; SM90-NEXT: $L__BB403_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_seq_cst_i16_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB404_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB404_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB404_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB404_1; +; SM90-NEXT: $L__BB404_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_monotonic_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB405_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB405_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB405_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB405_1; +; SM90-NEXT: $L__BB405_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_monotonic_i16_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB406_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB406_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB406_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB406_1; +; SM90-NEXT: $L__BB406_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_monotonic_i16_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB407_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB407_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB407_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB407_1; +; SM90-NEXT: $L__BB407_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_monotonic_i16_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB408_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB408_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB408_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB408_1; +; SM90-NEXT: $L__BB408_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_monotonic_i16_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB409_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB409_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB409_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB409_1; +; SM90-NEXT: $L__BB409_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_monotonic_i16_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB410_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB410_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB410_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB410_1; +; SM90-NEXT: $L__BB410_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_monotonic_i16_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB411_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB411_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB411_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB411_1; +; SM90-NEXT: $L__BB411_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_monotonic_i16_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB412_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB412_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB412_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB412_1; +; SM90-NEXT: $L__BB412_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_monotonic_i16_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB413_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB413_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB413_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB413_1; +; SM90-NEXT: $L__BB413_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_monotonic_i16_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB414_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB414_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB414_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB414_1; +; SM90-NEXT: $L__BB414_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_monotonic_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB415_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB415_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB415_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB415_1; +; SM90-NEXT: $L__BB415_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_monotonic_i16_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB416_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB416_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB416_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB416_1; +; SM90-NEXT: $L__BB416_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_monotonic_i16_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB417_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB417_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB417_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB417_1; +; SM90-NEXT: $L__BB417_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_monotonic_i16_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB418_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB418_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB418_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB418_1; +; SM90-NEXT: $L__BB418_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_monotonic_i16_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB419_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB419_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB419_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB419_1; +; SM90-NEXT: $L__BB419_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_acquire_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB420_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB420_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB420_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB420_1; +; SM90-NEXT: $L__BB420_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_acquire_i16_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB421_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB421_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB421_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB421_1; +; SM90-NEXT: $L__BB421_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_acquire_i16_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB422_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB422_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB422_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB422_1; +; SM90-NEXT: $L__BB422_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_acquire_i16_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB423_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB423_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB423_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB423_1; +; SM90-NEXT: $L__BB423_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_acquire_i16_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB424_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB424_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB424_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB424_1; +; SM90-NEXT: $L__BB424_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_acquire_i16_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB425_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB425_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB425_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB425_1; +; SM90-NEXT: $L__BB425_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_acquire_i16_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB426_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB426_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB426_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB426_1; +; SM90-NEXT: $L__BB426_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_acquire_i16_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB427_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB427_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB427_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB427_1; +; SM90-NEXT: $L__BB427_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_acquire_i16_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB428_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB428_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB428_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB428_1; +; SM90-NEXT: $L__BB428_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_acquire_i16_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB429_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB429_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB429_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB429_1; +; SM90-NEXT: $L__BB429_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_acquire_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB430_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB430_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB430_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB430_1; +; SM90-NEXT: $L__BB430_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_acquire_i16_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB431_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB431_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB431_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB431_1; +; SM90-NEXT: $L__BB431_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_acquire_i16_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB432_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB432_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB432_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB432_1; +; SM90-NEXT: $L__BB432_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_acquire_i16_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB433_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB433_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB433_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB433_1; +; SM90-NEXT: $L__BB433_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_acquire_i16_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB434_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB434_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB434_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB434_1; +; SM90-NEXT: $L__BB434_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_seq_cst_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB435_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB435_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB435_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB435_1; +; SM90-NEXT: $L__BB435_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_seq_cst_i16_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB436_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB436_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB436_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB436_1; +; SM90-NEXT: $L__BB436_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_seq_cst_i16_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB437_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB437_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB437_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB437_1; +; SM90-NEXT: $L__BB437_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_seq_cst_i16_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB438_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB438_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB438_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB438_1; +; SM90-NEXT: $L__BB438_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_seq_cst_i16_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB439_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB439_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB439_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB439_1; +; SM90-NEXT: $L__BB439_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_seq_cst_i16_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB440_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB440_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB440_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB440_1; +; SM90-NEXT: $L__BB440_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_seq_cst_i16_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB441_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB441_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB441_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB441_1; +; SM90-NEXT: $L__BB441_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_seq_cst_i16_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB442_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB442_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB442_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB442_1; +; SM90-NEXT: $L__BB442_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_seq_cst_i16_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB443_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB443_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB443_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB443_1; +; SM90-NEXT: $L__BB443_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_seq_cst_i16_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB444_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB444_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB444_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB444_1; +; SM90-NEXT: $L__BB444_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_seq_cst_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB445_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB445_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB445_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB445_1; +; SM90-NEXT: $L__BB445_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_seq_cst_i16_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB446_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB446_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB446_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB446_1; +; SM90-NEXT: $L__BB446_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_seq_cst_i16_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB447_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB447_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB447_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB447_1; +; SM90-NEXT: $L__BB447_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_seq_cst_i16_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB448_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB448_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB448_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB448_1; +; SM90-NEXT: $L__BB448_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_seq_cst_i16_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB449_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB449_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB449_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB449_1; +; SM90-NEXT: $L__BB449_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst + ret i16 %new +} + +define i32 @monotonic_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_monotonic_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_param_2]; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_monotonic_i32_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_sys_param_2]; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_monotonic_i32_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_cta_param_2]; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_monotonic_i32_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_cluster_param_2]; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_monotonic_i32_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_gpu_param_2]; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_monotonic_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_param_2]; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_monotonic_i32_global_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_sys_param_2]; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_monotonic_i32_global_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_cta_param_2]; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_monotonic_i32_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_cluster_param_2]; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_monotonic_i32_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_gpu_param_2]; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_monotonic_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_param_2]; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_monotonic_i32_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_sys_param_2]; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_monotonic_i32_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_cta_param_2]; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_monotonic_i32_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_cluster_param_2]; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_monotonic_i32_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_gpu_param_2]; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_acquire_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_acquire_i32_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_acquire_i32_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_acquire_i32_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_acquire_i32_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_acquire_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_acquire_i32_global_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_acquire_i32_global_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_acquire_i32_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_acquire_i32_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_acquire_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_acquire_i32_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_acquire_i32_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_acquire_i32_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_acquire_i32_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_seq_cst_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_seq_cst_i32_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_seq_cst_i32_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_seq_cst_i32_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_seq_cst_i32_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_seq_cst_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_seq_cst_i32_global_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_seq_cst_i32_global_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_seq_cst_i32_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_seq_cst_i32_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_seq_cst_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_seq_cst_i32_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_seq_cst_i32_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_seq_cst_i32_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_seq_cst_i32_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst + ret i32 %new +} + +define i32 @acquire_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_monotonic_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_monotonic_i32_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_monotonic_i32_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_monotonic_i32_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_monotonic_i32_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_monotonic_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_monotonic_i32_global_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_monotonic_i32_global_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_monotonic_i32_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_monotonic_i32_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_monotonic_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_monotonic_i32_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_monotonic_i32_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_monotonic_i32_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_monotonic_i32_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic + ret i32 %new +} + +define i32 @acquire_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_acquire_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_acquire_i32_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_acquire_i32_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_acquire_i32_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_acquire_i32_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_acquire_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_acquire_i32_global_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_acquire_i32_global_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_acquire_i32_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_acquire_i32_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_acquire_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_acquire_i32_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_acquire_i32_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_acquire_i32_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_acquire_i32_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_seq_cst_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_seq_cst_i32_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_seq_cst_i32_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_seq_cst_i32_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_seq_cst_i32_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_seq_cst_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_seq_cst_i32_global_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_seq_cst_i32_global_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_seq_cst_i32_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_seq_cst_i32_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_seq_cst_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_seq_cst_i32_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_seq_cst_i32_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_seq_cst_i32_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_seq_cst_i32_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst + ret i32 %new +} + +define i32 @release_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_monotonic_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_param_2]; +; SM90-NEXT: atom.release.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_monotonic_i32_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_sys_param_2]; +; SM90-NEXT: atom.release.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_monotonic_i32_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_cta_param_2]; +; SM90-NEXT: atom.release.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_monotonic_i32_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_cluster_param_2]; +; SM90-NEXT: atom.release.cluster.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_monotonic_i32_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_gpu_param_2]; +; SM90-NEXT: atom.release.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_monotonic_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_param_2]; +; SM90-NEXT: atom.release.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_monotonic_i32_global_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_sys_param_2]; +; SM90-NEXT: atom.release.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_monotonic_i32_global_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_cta_param_2]; +; SM90-NEXT: atom.release.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_monotonic_i32_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_cluster_param_2]; +; SM90-NEXT: atom.release.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_monotonic_i32_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_gpu_param_2]; +; SM90-NEXT: atom.release.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_monotonic_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_param_2]; +; SM90-NEXT: atom.release.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_monotonic_i32_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_sys_param_2]; +; SM90-NEXT: atom.release.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_monotonic_i32_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_cta_param_2]; +; SM90-NEXT: atom.release.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_monotonic_i32_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_cluster_param_2]; +; SM90-NEXT: atom.release.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_monotonic_i32_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_gpu_param_2]; +; SM90-NEXT: atom.release.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release monotonic + ret i32 %new +} + +define i32 @release_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_acquire_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_param_2]; +; SM90-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_acquire_i32_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_acquire_i32_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_acquire_i32_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_acquire_i32_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_acquire_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_param_2]; +; SM90-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_acquire_i32_global_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_acquire_i32_global_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_acquire_i32_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_acquire_i32_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_acquire_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_param_2]; +; SM90-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_acquire_i32_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_acquire_i32_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_acquire_i32_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_acquire_i32_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release acquire + ret i32 %new +} + +define i32 @release_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_seq_cst_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_seq_cst_i32_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_seq_cst_i32_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_seq_cst_i32_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_seq_cst_i32_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_seq_cst_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_seq_cst_i32_global_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_seq_cst_i32_global_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_seq_cst_i32_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_seq_cst_i32_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_seq_cst_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_seq_cst_i32_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_seq_cst_i32_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_seq_cst_i32_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_seq_cst_i32_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_monotonic_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_param_2]; +; SM90-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_monotonic_i32_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_monotonic_i32_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_monotonic_i32_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_monotonic_i32_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_monotonic_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_param_2]; +; SM90-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_monotonic_i32_global_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_monotonic_i32_global_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_monotonic_i32_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_monotonic_i32_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_monotonic_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_param_2]; +; SM90-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_monotonic_i32_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_monotonic_i32_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_monotonic_i32_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_monotonic_i32_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_param_2]; +; SM90-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_param_2]; +; SM90-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_global_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_global_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_param_2]; +; SM90-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_seq_cst_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_seq_cst_i32_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_seq_cst_i32_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_seq_cst_i32_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_seq_cst_i32_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_seq_cst_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_seq_cst_i32_global_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_seq_cst_i32_global_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_seq_cst_i32_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_seq_cst_i32_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_seq_cst_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_seq_cst_i32_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_seq_cst_i32_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_seq_cst_i32_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_seq_cst_i32_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_monotonic_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_monotonic_i32_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_monotonic_i32_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_monotonic_i32_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_monotonic_i32_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_monotonic_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_monotonic_i32_global_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_monotonic_i32_global_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_monotonic_i32_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_monotonic_i32_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_monotonic_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_monotonic_i32_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_monotonic_i32_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_monotonic_i32_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_monotonic_i32_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_acquire_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_acquire_i32_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_acquire_i32_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_acquire_i32_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_acquire_i32_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_acquire_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_acquire_i32_global_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_acquire_i32_global_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_acquire_i32_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_acquire_i32_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_acquire_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_acquire_i32_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_acquire_i32_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_acquire_i32_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_acquire_i32_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_seq_cst_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_seq_cst_i32_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_seq_cst_i32_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_seq_cst_i32_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_seq_cst_i32_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_seq_cst_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_seq_cst_i32_global_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_seq_cst_i32_global_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_seq_cst_i32_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_seq_cst_i32_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_seq_cst_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_seq_cst_i32_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_seq_cst_i32_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_seq_cst_i32_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_seq_cst_i32_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst + ret i32 %new +} + +define i64 @monotonic_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_monotonic_i64_generic( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_param_2]; +; SM90-NEXT: atom.relaxed.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_monotonic_i64_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_sys_param_2]; +; SM90-NEXT: atom.relaxed.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_monotonic_i64_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_cta_param_2]; +; SM90-NEXT: atom.relaxed.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_monotonic_i64_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_cluster_param_2]; +; SM90-NEXT: atom.relaxed.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_monotonic_i64_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_gpu_param_2]; +; SM90-NEXT: atom.relaxed.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_monotonic_i64_global( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_param_2]; +; SM90-NEXT: atom.relaxed.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_monotonic_i64_global_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_sys_param_2]; +; SM90-NEXT: atom.relaxed.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_monotonic_i64_global_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_cta_param_2]; +; SM90-NEXT: atom.relaxed.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_monotonic_i64_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_cluster_param_2]; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_monotonic_i64_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_gpu_param_2]; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_monotonic_i64_shared( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_param_2]; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_monotonic_i64_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_sys_param_2]; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_monotonic_i64_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_cta_param_2]; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_monotonic_i64_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_cluster_param_2]; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_monotonic_i64_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_gpu_param_2]; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_acquire_i64_generic( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_acquire_i64_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_acquire_i64_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_acquire_i64_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_acquire_i64_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_acquire_i64_global( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_acquire_i64_global_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_acquire_i64_global_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_acquire_i64_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_acquire_i64_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_acquire_i64_shared( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_acquire_i64_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_acquire_i64_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_acquire_i64_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_acquire_i64_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_seq_cst_i64_generic( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_seq_cst_i64_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_seq_cst_i64_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_seq_cst_i64_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_seq_cst_i64_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_seq_cst_i64_global( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_seq_cst_i64_global_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_seq_cst_i64_global_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_seq_cst_i64_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_seq_cst_i64_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_seq_cst_i64_shared( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_seq_cst_i64_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_seq_cst_i64_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_seq_cst_i64_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_seq_cst_i64_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst + ret i64 %new +} + +define i64 @acquire_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_monotonic_i64_generic( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_monotonic_i64_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_monotonic_i64_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_monotonic_i64_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_monotonic_i64_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_monotonic_i64_global( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_monotonic_i64_global_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_monotonic_i64_global_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_monotonic_i64_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_monotonic_i64_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_monotonic_i64_shared( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_monotonic_i64_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_monotonic_i64_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_monotonic_i64_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_monotonic_i64_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic + ret i64 %new +} + +define i64 @acquire_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_acquire_i64_generic( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_acquire_i64_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_acquire_i64_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_acquire_i64_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_acquire_i64_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_acquire_i64_global( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_acquire_i64_global_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_acquire_i64_global_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_acquire_i64_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_acquire_i64_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_acquire_i64_shared( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_acquire_i64_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_acquire_i64_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_acquire_i64_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_acquire_i64_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_seq_cst_i64_generic( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_seq_cst_i64_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_seq_cst_i64_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_seq_cst_i64_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_seq_cst_i64_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_seq_cst_i64_global( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_seq_cst_i64_global_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_seq_cst_i64_global_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_seq_cst_i64_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_seq_cst_i64_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_seq_cst_i64_shared( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_seq_cst_i64_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_seq_cst_i64_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_seq_cst_i64_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_seq_cst_i64_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst + ret i64 %new +} + +define i64 @release_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_monotonic_i64_generic( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_param_2]; +; SM90-NEXT: atom.release.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_monotonic_i64_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_sys_param_2]; +; SM90-NEXT: atom.release.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_monotonic_i64_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_cta_param_2]; +; SM90-NEXT: atom.release.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_monotonic_i64_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_cluster_param_2]; +; SM90-NEXT: atom.release.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_monotonic_i64_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_gpu_param_2]; +; SM90-NEXT: atom.release.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_monotonic_i64_global( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_param_2]; +; SM90-NEXT: atom.release.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_monotonic_i64_global_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_sys_param_2]; +; SM90-NEXT: atom.release.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_monotonic_i64_global_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_cta_param_2]; +; SM90-NEXT: atom.release.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_monotonic_i64_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_cluster_param_2]; +; SM90-NEXT: atom.release.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_monotonic_i64_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_gpu_param_2]; +; SM90-NEXT: atom.release.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_monotonic_i64_shared( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_param_2]; +; SM90-NEXT: atom.release.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_monotonic_i64_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_sys_param_2]; +; SM90-NEXT: atom.release.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_monotonic_i64_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_cta_param_2]; +; SM90-NEXT: atom.release.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_monotonic_i64_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_cluster_param_2]; +; SM90-NEXT: atom.release.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_monotonic_i64_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_gpu_param_2]; +; SM90-NEXT: atom.release.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release monotonic + ret i64 %new +} + +define i64 @release_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_acquire_i64_generic( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_param_2]; +; SM90-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release acquire + ret i64 %new +} + +define i64 @release_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_acquire_i64_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release acquire + ret i64 %new +} + +define i64 @release_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_acquire_i64_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release acquire + ret i64 %new +} + +define i64 @release_acquire_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_acquire_i64_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") release acquire + ret i64 %new +} + +define i64 @release_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_acquire_i64_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") release acquire + ret i64 %new +} + +define i64 @release_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_acquire_i64_global( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_param_2]; +; SM90-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release acquire + ret i64 %new +} + +define i64 @release_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_acquire_i64_global_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release acquire + ret i64 %new +} + +define i64 @release_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_acquire_i64_global_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release acquire + ret i64 %new +} + +define i64 @release_acquire_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_acquire_i64_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") release acquire + ret i64 %new +} + +define i64 @release_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_acquire_i64_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release acquire + ret i64 %new +} + +define i64 @release_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_acquire_i64_shared( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_param_2]; +; SM90-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release acquire + ret i64 %new +} + +define i64 @release_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_acquire_i64_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release acquire + ret i64 %new +} + +define i64 @release_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_acquire_i64_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release acquire + ret i64 %new +} + +define i64 @release_acquire_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_acquire_i64_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") release acquire + ret i64 %new +} + +define i64 @release_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_acquire_i64_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release acquire + ret i64 %new +} + +define i64 @release_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_seq_cst_i64_generic( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release seq_cst + ret i64 %new +} + +define i64 @release_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_seq_cst_i64_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release seq_cst + ret i64 %new +} + +define i64 @release_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_seq_cst_i64_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst + ret i64 %new +} + +define i64 @release_seq_cst_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_seq_cst_i64_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") release seq_cst + ret i64 %new +} + +define i64 @release_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_seq_cst_i64_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") release seq_cst + ret i64 %new +} + +define i64 @release_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_seq_cst_i64_global( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release seq_cst + ret i64 %new +} + +define i64 @release_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_seq_cst_i64_global_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release seq_cst + ret i64 %new +} + +define i64 @release_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_seq_cst_i64_global_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst + ret i64 %new +} + +define i64 @release_seq_cst_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_seq_cst_i64_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") release seq_cst + ret i64 %new +} + +define i64 @release_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_seq_cst_i64_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release seq_cst + ret i64 %new +} + +define i64 @release_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_seq_cst_i64_shared( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release seq_cst + ret i64 %new +} + +define i64 @release_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_seq_cst_i64_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release seq_cst + ret i64 %new +} + +define i64 @release_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_seq_cst_i64_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst + ret i64 %new +} + +define i64 @release_seq_cst_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_seq_cst_i64_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") release seq_cst + ret i64 %new } -define i32 @monotonic_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_monotonic_i32_generic( +define i64 @release_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_seq_cst_i64_shared_gpu( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_param_2]; -; SM90-NEXT: atom.relaxed.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic monotonic - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release seq_cst + ret i64 %new } -define i32 @monotonic_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_monotonic_i32_global( +define i64 @acq_rel_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_monotonic_i64_generic( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_param_2]; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_param_2]; +; SM90-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel monotonic + ret i64 %new } -define i32 @monotonic_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_monotonic_i32_shared( +define i64 @acq_rel_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_monotonic_i64_generic_sys( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_param_2]; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic + ret i64 %new } -define i32 @monotonic_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_acquire_i32_generic( +define i64 @acq_rel_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_monotonic_i64_generic_cta( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic acquire - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic + ret i64 %new } -define i32 @monotonic_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_acquire_i32_global( +define i64 @acq_rel_monotonic_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_monotonic_i64_generic_cluster( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic acquire - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") acq_rel monotonic + ret i64 %new } -define i32 @monotonic_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_acquire_i32_shared( +define i64 @acq_rel_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_monotonic_i64_generic_gpu( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic acquire - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic + ret i64 %new } -define i32 @monotonic_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_seq_cst_i32_generic( +define i64 @acq_rel_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_monotonic_i64_global( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_param_2]; +; SM90-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel monotonic + ret i64 %new } -define i32 @monotonic_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_seq_cst_i32_global( +define i64 @acq_rel_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_monotonic_i64_global_sys( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic + ret i64 %new } -define i32 @monotonic_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_seq_cst_i32_shared( +define i64 @acq_rel_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_monotonic_i64_global_cta( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic + ret i64 %new } -define i32 @acquire_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_monotonic_i32_generic( +define i64 @acq_rel_monotonic_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_monotonic_i64_global_cluster( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire monotonic - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") acq_rel monotonic + ret i64 %new } -define i32 @acquire_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_monotonic_i32_global( +define i64 @acq_rel_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_monotonic_i64_global_gpu( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire monotonic - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic + ret i64 %new } -define i32 @acquire_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_monotonic_i32_shared( +define i64 @acq_rel_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_monotonic_i64_shared( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_param_2]; +; SM90-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire monotonic - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel monotonic + ret i64 %new } -define i32 @acquire_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_acquire_i32_generic( +define i64 @acq_rel_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_monotonic_i64_shared_sys( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic + ret i64 %new } -define i32 @acquire_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_acquire_i32_global( +define i64 @acq_rel_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_monotonic_i64_shared_cta( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic + ret i64 %new } -define i32 @acquire_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_acquire_i32_shared( +define i64 @acq_rel_monotonic_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_monotonic_i64_shared_cluster( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") acq_rel monotonic + ret i64 %new } -define i32 @acquire_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_seq_cst_i32_generic( +define i64 @acq_rel_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_monotonic_i64_shared_gpu( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic + ret i64 %new } -define i32 @acquire_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_seq_cst_i32_global( +define i64 @acq_rel_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_acquire_i64_generic( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_param_2]; +; SM90-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel acquire + ret i64 %new +} + +define i64 @acq_rel_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_acquire_i64_generic_sys( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire seq_cst - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire + ret i64 %new } -define i32 @acquire_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_seq_cst_i32_shared( +define i64 @acq_rel_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_acquire_i64_generic_cta( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire seq_cst - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire + ret i64 %new } -define i32 @release_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_monotonic_i32_generic( +define i64 @acq_rel_acquire_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_acquire_i64_generic_cluster( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_param_2]; -; SM90-NEXT: atom.release.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") acq_rel acquire + ret i64 %new } -define i32 @release_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_monotonic_i32_global( +define i64 @acq_rel_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_acquire_i64_generic_gpu( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_param_2]; -; SM90-NEXT: atom.release.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire + ret i64 %new } -define i32 @release_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_monotonic_i32_shared( +define i64 @acq_rel_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_acquire_i64_global( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_param_2]; -; SM90-NEXT: atom.release.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_param_2]; +; SM90-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release monotonic - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel acquire + ret i64 %new } -define i32 @release_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_acquire_i32_generic( +define i64 @acq_rel_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_acquire_i64_global_sys( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_param_2]; -; SM90-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire + ret i64 %new } -define i32 @release_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_acquire_i32_global( +define i64 @acq_rel_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_acquire_i64_global_cta( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_param_2]; -; SM90-NEXT: atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire + ret i64 %new } -define i32 @release_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_acquire_i32_shared( +define i64 @acq_rel_acquire_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_acquire_i64_global_cluster( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_param_2]; -; SM90-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") acq_rel acquire + ret i64 %new } -define i32 @release_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_seq_cst_i32_generic( +define i64 @acq_rel_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_acquire_i64_global_gpu( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire + ret i64 %new } -define i32 @release_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_seq_cst_i32_global( +define i64 @acq_rel_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_acquire_i64_shared( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_param_2]; +; SM90-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel acquire + ret i64 %new } -define i32 @release_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_seq_cst_i32_shared( +define i64 @acq_rel_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_acquire_i64_shared_sys( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire + ret i64 %new } -define i32 @acq_rel_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_monotonic_i32_generic( +define i64 @acq_rel_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_acquire_i64_shared_cta( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_param_2]; -; SM90-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel monotonic - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire + ret i64 %new } -define i32 @acq_rel_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_monotonic_i32_global( +define i64 @acq_rel_acquire_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_acquire_i64_shared_cluster( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_param_2]; -; SM90-NEXT: atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel monotonic - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") acq_rel acquire + ret i64 %new } -define i32 @acq_rel_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_monotonic_i32_shared( +define i64 @acq_rel_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_acquire_i64_shared_gpu( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_param_2]; -; SM90-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel monotonic - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire + ret i64 %new } -define i32 @acq_rel_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_acquire_i32_generic( +define i64 @acq_rel_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_seq_cst_i64_generic( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_param_2]; -; SM90-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel acquire - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel seq_cst + ret i64 %new } -define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_acquire_i32_global( +define i64 @acq_rel_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_seq_cst_i64_generic_sys( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_param_2]; -; SM90-NEXT: atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst + ret i64 %new } -define i32 @acq_rel_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_acquire_i32_shared( +define i64 @acq_rel_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_seq_cst_i64_generic_cta( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_param_2]; -; SM90-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel acquire - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst + ret i64 %new } -define i32 @acq_rel_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_seq_cst_i32_generic( +define i64 @acq_rel_seq_cst_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_seq_cst_i64_generic_cluster( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel seq_cst - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") acq_rel seq_cst + ret i64 %new } -define i32 @acq_rel_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_seq_cst_i32_global( +define i64 @acq_rel_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_seq_cst_i64_generic_gpu( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel seq_cst - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst + ret i64 %new } -define i32 @acq_rel_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_seq_cst_i32_shared( +define i64 @acq_rel_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_seq_cst_i64_global( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel seq_cst + ret i64 %new } -define i32 @seq_cst_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_monotonic_i32_generic( +define i64 @acq_rel_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_seq_cst_i64_global_sys( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst monotonic - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst + ret i64 %new } -define i32 @seq_cst_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_monotonic_i32_global( +define i64 @acq_rel_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_seq_cst_i64_global_cta( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst monotonic - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst + ret i64 %new } -define i32 @seq_cst_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_monotonic_i32_shared( +define i64 @acq_rel_seq_cst_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_seq_cst_i64_global_cluster( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst monotonic - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") acq_rel seq_cst + ret i64 %new } -define i32 @seq_cst_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_acquire_i32_generic( +define i64 @acq_rel_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_seq_cst_i64_global_gpu( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst + ret i64 %new } -define i32 @seq_cst_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_acquire_i32_global( +define i64 @acq_rel_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_seq_cst_i64_shared( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel seq_cst + ret i64 %new } -define i32 @seq_cst_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_acquire_i32_shared( +define i64 @acq_rel_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_seq_cst_i64_shared_sys( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst + ret i64 %new } -define i32 @seq_cst_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_seq_cst_i32_generic( +define i64 @acq_rel_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_seq_cst_i64_shared_cta( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst + ret i64 %new } -define i32 @seq_cst_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_seq_cst_i32_global( +define i64 @acq_rel_seq_cst_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_seq_cst_i64_shared_cluster( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") acq_rel seq_cst + ret i64 %new } -define i32 @seq_cst_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_seq_cst_i32_shared( +define i64 @acq_rel_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_seq_cst_i64_shared_gpu( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst + ret i64 %new } -define i64 @monotonic_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_monotonic_i64_generic( +define i64 @seq_cst_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_monotonic_i64_generic( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_param_2]; -; SM90-NEXT: atom.relaxed.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst monotonic ret i64 %new } -define i64 @monotonic_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_monotonic_i64_global( +define i64 @seq_cst_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_monotonic_i64_generic_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_param_2]; -; SM90-NEXT: atom.relaxed.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic ret i64 %new } -define i64 @monotonic_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_monotonic_i64_shared( +define i64 @seq_cst_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_monotonic_i64_generic_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_param_2]; -; SM90-NEXT: atom.relaxed.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic ret i64 %new } -define i64 @monotonic_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_acquire_i64_generic( +define i64 @seq_cst_monotonic_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_monotonic_i64_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic acquire + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") seq_cst monotonic ret i64 %new } -define i64 @monotonic_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_acquire_i64_global( +define i64 @seq_cst_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_monotonic_i64_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic acquire + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic ret i64 %new } -define i64 @monotonic_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_acquire_i64_shared( +define i64 @seq_cst_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_monotonic_i64_global( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst monotonic ret i64 %new } -define i64 @monotonic_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_seq_cst_i64_generic( +define i64 @seq_cst_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_monotonic_i64_global_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic ret i64 %new } -define i64 @monotonic_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_seq_cst_i64_global( +define i64 @seq_cst_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_monotonic_i64_global_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic ret i64 %new } -define i64 @monotonic_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_seq_cst_i64_shared( +define i64 @seq_cst_monotonic_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_monotonic_i64_global_cluster( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") seq_cst monotonic ret i64 %new } -define i64 @acquire_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_monotonic_i64_generic( +define i64 @seq_cst_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_monotonic_i64_global_gpu( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic ret i64 %new } -define i64 @acquire_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_monotonic_i64_global( +define i64 @seq_cst_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_monotonic_i64_shared( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst monotonic ret i64 %new } -define i64 @acquire_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_monotonic_i64_shared( +define i64 @seq_cst_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_monotonic_i64_shared_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic ret i64 %new } -define i64 @acquire_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_acquire_i64_generic( +define i64 @seq_cst_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_monotonic_i64_shared_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic ret i64 %new } -define i64 @acquire_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_acquire_i64_global( +define i64 @seq_cst_monotonic_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_monotonic_i64_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") seq_cst monotonic ret i64 %new } -define i64 @acquire_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_acquire_i64_shared( +define i64 @seq_cst_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_monotonic_i64_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic ret i64 %new } -define i64 @acquire_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_seq_cst_i64_generic( +define i64 @seq_cst_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_acquire_i64_generic( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire seq_cst + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst acquire ret i64 %new } -define i64 @acquire_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_seq_cst_i64_global( +define i64 @seq_cst_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_acquire_i64_generic_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire seq_cst + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire ret i64 %new } -define i64 @acquire_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_seq_cst_i64_shared( +define i64 @seq_cst_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_acquire_i64_generic_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire seq_cst + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire ret i64 %new } -define i64 @release_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_monotonic_i64_generic( +define i64 @seq_cst_acquire_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_acquire_i64_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_param_2]; -; SM90-NEXT: atom.release.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") seq_cst acquire ret i64 %new } -define i64 @release_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_monotonic_i64_global( +define i64 @seq_cst_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_acquire_i64_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_param_2]; -; SM90-NEXT: atom.release.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire ret i64 %new } -define i64 @release_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_monotonic_i64_shared( +define i64 @seq_cst_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_acquire_i64_global( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_param_2]; -; SM90-NEXT: atom.release.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst acquire ret i64 %new } -define i64 @release_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_acquire_i64_generic( +define i64 @seq_cst_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_acquire_i64_global_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_param_2]; -; SM90-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire ret i64 %new } -define i64 @release_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_acquire_i64_global( +define i64 @seq_cst_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_acquire_i64_global_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_param_2]; -; SM90-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire ret i64 %new } -define i64 @release_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_acquire_i64_shared( +define i64 @seq_cst_acquire_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_acquire_i64_global_cluster( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_param_2]; -; SM90-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") seq_cst acquire ret i64 %new } -define i64 @release_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_seq_cst_i64_generic( +define i64 @seq_cst_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_acquire_i64_global_gpu( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire ret i64 %new } -define i64 @release_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_seq_cst_i64_global( +define i64 @seq_cst_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_acquire_i64_shared( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst acquire ret i64 %new } -define i64 @release_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_seq_cst_i64_shared( +define i64 @seq_cst_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_acquire_i64_shared_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire ret i64 %new } -define i64 @acq_rel_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_monotonic_i64_generic( +define i64 @seq_cst_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_acquire_i64_shared_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_param_2]; -; SM90-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire ret i64 %new } -define i64 @acq_rel_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_monotonic_i64_global( +define i64 @seq_cst_acquire_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_acquire_i64_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_param_2]; -; SM90-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") seq_cst acquire ret i64 %new } -define i64 @acq_rel_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_monotonic_i64_shared( +define i64 @seq_cst_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_acquire_i64_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_param_2]; -; SM90-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire ret i64 %new } -define i64 @acq_rel_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_acquire_i64_generic( +define i64 @seq_cst_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_seq_cst_i64_generic( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_param_2]; -; SM90-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel acquire + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst ret i64 %new } -define i64 @acq_rel_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_acquire_i64_global( +define i64 @seq_cst_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_seq_cst_i64_generic_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_param_2]; -; SM90-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel acquire + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst ret i64 %new } -define i64 @acq_rel_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_acquire_i64_shared( +define i64 @seq_cst_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_seq_cst_i64_generic_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_param_2]; -; SM90-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel acquire + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst ret i64 %new } -define i64 @acq_rel_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_seq_cst_i64_generic( +define i64 @seq_cst_seq_cst_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_seq_cst_i64_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel seq_cst + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") seq_cst seq_cst ret i64 %new } -define i64 @acq_rel_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_seq_cst_i64_global( +define i64 @seq_cst_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_seq_cst_i64_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel seq_cst + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst ret i64 %new } -define i64 @acq_rel_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_seq_cst_i64_shared( +define i64 @seq_cst_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_seq_cst_i64_global( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_monotonic_i64_generic( +define i64 @seq_cst_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_seq_cst_i64_global_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_monotonic_i64_global( +define i64 @seq_cst_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_seq_cst_i64_global_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_monotonic_i64_shared( +define i64 @seq_cst_seq_cst_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_seq_cst_i64_global_cluster( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_acquire_i64_generic( +define i64 @seq_cst_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_seq_cst_i64_global_gpu( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_acquire_i64_global( +define i64 @seq_cst_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_seq_cst_i64_shared( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_acquire_i64_shared( +define i64 @seq_cst_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_seq_cst_i64_shared_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_seq_cst_i64_generic( +define i64 @seq_cst_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_seq_cst_i64_shared_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_seq_cst_i64_global( +define i64 @seq_cst_seq_cst_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_seq_cst_i64_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_seq_cst_i64_shared( +define i64 @seq_cst_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_seq_cst_i64_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst ret i64 %new } diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg.ll b/llvm/test/CodeGen/NVPTX/cmpxchg.ll index 9eeff9d7c2b75..9d72c3f44d3a1 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg.ll @@ -79,7 +79,7 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB0_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -206,7 +206,7 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB1_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -336,7 +336,7 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB2_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -466,7 +466,7 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB3_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -598,7 +598,7 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB4_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -726,7 +726,7 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB5_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -850,7 +850,7 @@ define i16 @acquire_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB6_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -977,7 +977,7 @@ define i16 @release_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB7_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1104,7 +1104,7 @@ define i16 @acq_rel_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB8_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1234,7 +1234,7 @@ define i16 @seq_cst_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB9_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1316,7 +1316,7 @@ define i32 @relaxed_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: ld.param.b64 %rd1, [relaxed_sys_i32_param_0]; ; SM70-NEXT: ld.param.b32 %r1, [relaxed_sys_i32_param_1]; ; SM70-NEXT: ld.param.b32 %r2, [relaxed_sys_i32_param_2]; -; SM70-NEXT: atom.relaxed.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; ; SM90-LABEL: relaxed_sys_i32( @@ -1358,7 +1358,7 @@ define i32 @acq_rel_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_sys_i32_param_0]; ; SM70-NEXT: ld.param.b32 %r1, [acq_rel_sys_i32_param_1]; ; SM70-NEXT: ld.param.b32 %r2, [acq_rel_sys_i32_param_2]; -; SM70-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; ; SM90-LABEL: acq_rel_sys_i32( @@ -1400,7 +1400,7 @@ define i32 @acquire_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: ld.param.b64 %rd1, [acquire_sys_i32_param_0]; ; SM70-NEXT: ld.param.b32 %r1, [acquire_sys_i32_param_1]; ; SM70-NEXT: ld.param.b32 %r2, [acquire_sys_i32_param_2]; -; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; ; SM90-LABEL: acquire_sys_i32( @@ -1442,7 +1442,7 @@ define i32 @release_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: ld.param.b64 %rd1, [release_sys_i32_param_0]; ; SM70-NEXT: ld.param.b32 %r1, [release_sys_i32_param_1]; ; SM70-NEXT: ld.param.b32 %r2, [release_sys_i32_param_2]; -; SM70-NEXT: atom.release.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: atom.release.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; ; SM90-LABEL: release_sys_i32( @@ -1486,7 +1486,7 @@ define i32 @seq_cst_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: ld.param.b32 %r1, [seq_cst_sys_i32_param_1]; ; SM70-NEXT: ld.param.b32 %r2, [seq_cst_sys_i32_param_2]; -; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; ; SM90-LABEL: seq_cst_sys_i32( @@ -1529,7 +1529,7 @@ define i64 @relaxed_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: ld.param.b64 %rd1, [relaxed_sys_i64_param_0]; ; SM70-NEXT: ld.param.b64 %rd2, [relaxed_sys_i64_param_1]; ; SM70-NEXT: ld.param.b64 %rd3, [relaxed_sys_i64_param_2]; -; SM70-NEXT: atom.relaxed.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: atom.relaxed.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; ; SM90-LABEL: relaxed_sys_i64( @@ -1568,7 +1568,7 @@ define i64 @acquire_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: ld.param.b64 %rd1, [acquire_sys_i64_param_0]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_sys_i64_param_1]; ; SM70-NEXT: ld.param.b64 %rd3, [acquire_sys_i64_param_2]; -; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; ; SM90-LABEL: acquire_sys_i64( @@ -1607,7 +1607,7 @@ define i64 @acq_rel_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_sys_i64_param_0]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i64_param_1]; ; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_sys_i64_param_2]; -; SM70-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; ; SM90-LABEL: acq_rel_sys_i64( @@ -1646,7 +1646,7 @@ define i64 @release_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: ld.param.b64 %rd1, [release_sys_i64_param_0]; ; SM70-NEXT: ld.param.b64 %rd2, [release_sys_i64_param_1]; ; SM70-NEXT: ld.param.b64 %rd3, [release_sys_i64_param_2]; -; SM70-NEXT: atom.release.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: atom.release.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; ; SM90-LABEL: release_sys_i64( @@ -1687,7 +1687,7 @@ define i64 @seq_cst_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i64_param_1]; ; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_sys_i64_param_2]; -; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; ; SM90-LABEL: seq_cst_sys_i64( diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg.py b/llvm/test/CodeGen/NVPTX/cmpxchg.py index ae7450015ecd2..3f02b9d28fdee 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg.py +++ b/llvm/test/CodeGen/NVPTX/cmpxchg.py @@ -5,6 +5,14 @@ from itertools import product cmpxchg_func = Template( + """define i$size @${success}_${failure}_i${size}_${addrspace}_${ptx_scope}(ptr${addrspace_cast} %addr, i$size %cmp, i$size %new) { + %pairold = cmpxchg ptr${addrspace_cast} %addr, i$size %cmp, i$size %new syncscope(\"${llvm_scope}\") $success $failure + ret i$size %new +} +""" +) + +cmpxchg_func_no_scope = Template( """define i$size @${success}_${failure}_i${size}_${addrspace}(ptr${addrspace_cast} %addr, i$size %cmp, i$size %new) { %pairold = cmpxchg ptr${addrspace_cast} %addr, i$size %cmp, i$size %new $success $failure ret i$size %new @@ -45,8 +53,9 @@ addrspace_cast = "" else: addrspace_cast = " addrspace({})".format(str(addrspace)) + # Test default scope print( - cmpxchg_func.substitute( + cmpxchg_func_no_scope.substitute( success=success, failure=failure, size=size, @@ -55,3 +64,20 @@ ), file=fp, ) + + for llvm_scope in LLVM_SCOPES: + # cluster ordering is supported from SM90 onwards + if sm < 90 and llvm_scope == "cluster": + continue + print( + cmpxchg_func.substitute( + success=success, + failure=failure, + size=size, + addrspace=ADDRSPACE_NUM_TO_ADDRSPACE[addrspace], + addrspace_cast=addrspace_cast, + llvm_scope=llvm_scope, + ptx_scope=SCOPE_LLVM_TO_PTX[llvm_scope], + ), + file=fp, + ) diff --git a/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll b/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll index cea3ac37c1964..45202ee010328 100644 --- a/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll +++ b/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll @@ -172,30 +172,30 @@ define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: ld.param.b64 %rd2, [test_distributed_shared_cluster_cmpxchg_param_0]; -; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b32 %r24, [%rd2], 1, 0; -; CHECK-NEXT: atom.acquire.shared::cluster.cas.b32 %r25, [%rd2], 1, 0; -; CHECK-NEXT: atom.acquire.shared::cluster.cas.b32 %r26, [%rd2], 1, 0; -; CHECK-NEXT: atom.release.shared::cluster.cas.b32 %r27, [%rd2], 1, 0; -; CHECK-NEXT: atom.acq_rel.shared::cluster.cas.b32 %r28, [%rd2], 1, 0; -; CHECK-NEXT: atom.acq_rel.shared::cluster.cas.b32 %r29, [%rd2], 1, 0; +; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b32 %r24, [%rd2], 1, 0; +; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b32 %r25, [%rd2], 1, 0; +; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b32 %r26, [%rd2], 1, 0; +; CHECK-NEXT: atom.release.sys.shared::cluster.cas.b32 %r27, [%rd2], 1, 0; +; CHECK-NEXT: atom.acq_rel.sys.shared::cluster.cas.b32 %r28, [%rd2], 1, 0; +; CHECK-NEXT: atom.acq_rel.sys.shared::cluster.cas.b32 %r29, [%rd2], 1, 0; ; CHECK-NEXT: fence.sc.sys; -; CHECK-NEXT: atom.acquire.shared::cluster.cas.b32 %r30, [%rd2], 1, 0; +; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b32 %r30, [%rd2], 1, 0; ; CHECK-NEXT: fence.sc.sys; -; CHECK-NEXT: atom.acquire.shared::cluster.cas.b32 %r31, [%rd2], 1, 0; +; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b32 %r31, [%rd2], 1, 0; ; CHECK-NEXT: fence.sc.sys; -; CHECK-NEXT: atom.acquire.shared::cluster.cas.b32 %r32, [%rd2], 1, 0; -; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b64 %rd3, [%rd2], 1, 0; -; CHECK-NEXT: atom.acquire.shared::cluster.cas.b64 %rd4, [%rd2], 1, 0; -; CHECK-NEXT: atom.acquire.shared::cluster.cas.b64 %rd5, [%rd2], 1, 0; -; CHECK-NEXT: atom.release.shared::cluster.cas.b64 %rd6, [%rd2], 1, 0; -; CHECK-NEXT: atom.acq_rel.shared::cluster.cas.b64 %rd7, [%rd2], 1, 0; -; CHECK-NEXT: atom.acq_rel.shared::cluster.cas.b64 %rd8, [%rd2], 1, 0; +; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b32 %r32, [%rd2], 1, 0; +; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b64 %rd3, [%rd2], 1, 0; +; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b64 %rd4, [%rd2], 1, 0; +; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b64 %rd5, [%rd2], 1, 0; +; CHECK-NEXT: atom.release.sys.shared::cluster.cas.b64 %rd6, [%rd2], 1, 0; +; CHECK-NEXT: atom.acq_rel.sys.shared::cluster.cas.b64 %rd7, [%rd2], 1, 0; +; CHECK-NEXT: atom.acq_rel.sys.shared::cluster.cas.b64 %rd8, [%rd2], 1, 0; ; CHECK-NEXT: fence.sc.sys; -; CHECK-NEXT: atom.acquire.shared::cluster.cas.b64 %rd9, [%rd2], 1, 0; +; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b64 %rd9, [%rd2], 1, 0; ; CHECK-NEXT: fence.sc.sys; -; CHECK-NEXT: atom.acquire.shared::cluster.cas.b64 %rd10, [%rd2], 1, 0; +; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b64 %rd10, [%rd2], 1, 0; ; CHECK-NEXT: fence.sc.sys; -; CHECK-NEXT: atom.acquire.shared::cluster.cas.b64 %rd11, [%rd2], 1, 0; +; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b64 %rd11, [%rd2], 1, 0; ; CHECK-NEXT: and.b64 %rd1, %rd2, -4; ; CHECK-NEXT: cvt.u32.u64 %r33, %rd2; ; CHECK-NEXT: and.b32 %r34, %r33, 3; @@ -210,7 +210,7 @@ define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr ; CHECK-NEXT: $L__BB4_1: // %partword.cmpxchg.loop33 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: or.b32 %r39, %r48, %r3; -; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b32 %r6, [%rd1], %r39, %r48; +; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b32 %r6, [%rd1], %r39, %r48; ; CHECK-NEXT: setp.eq.s32 %p1, %r6, %r39; ; CHECK-NEXT: @%p1 bra $L__BB4_3; ; CHECK-NEXT: // %bb.2: // %partword.cmpxchg.failure32 @@ -225,7 +225,7 @@ define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr ; CHECK-NEXT: $L__BB4_4: // %partword.cmpxchg.loop23 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: or.b32 %r41, %r49, %r3; -; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b32 %r10, [%rd1], %r41, %r49; +; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b32 %r10, [%rd1], %r41, %r49; ; CHECK-NEXT: setp.eq.s32 %p3, %r10, %r41; ; CHECK-NEXT: @%p3 bra $L__BB4_6; ; CHECK-NEXT: // %bb.5: // %partword.cmpxchg.failure22 @@ -242,7 +242,7 @@ define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr ; CHECK-NEXT: $L__BB4_7: // %partword.cmpxchg.loop13 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: or.b32 %r43, %r50, %r3; -; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b32 %r14, [%rd1], %r43, %r50; +; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b32 %r14, [%rd1], %r43, %r50; ; CHECK-NEXT: setp.eq.s32 %p5, %r14, %r43; ; CHECK-NEXT: @%p5 bra $L__BB4_9; ; CHECK-NEXT: // %bb.8: // %partword.cmpxchg.failure12 @@ -258,7 +258,7 @@ define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr ; CHECK-NEXT: $L__BB4_10: // %partword.cmpxchg.loop3 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: or.b32 %r45, %r51, %r3; -; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b32 %r18, [%rd1], %r45, %r51; +; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b32 %r18, [%rd1], %r45, %r51; ; CHECK-NEXT: setp.eq.s32 %p7, %r18, %r45; ; CHECK-NEXT: @%p7 bra $L__BB4_12; ; CHECK-NEXT: // %bb.11: // %partword.cmpxchg.failure2 @@ -275,7 +275,7 @@ define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr ; CHECK-NEXT: $L__BB4_13: // %partword.cmpxchg.loop ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: or.b32 %r47, %r52, %r3; -; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b32 %r22, [%rd1], %r47, %r52; +; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b32 %r22, [%rd1], %r47, %r52; ; CHECK-NEXT: setp.eq.s32 %p9, %r22, %r47; ; CHECK-NEXT: @%p9 bra $L__BB4_15; ; CHECK-NEXT: // %bb.14: // %partword.cmpxchg.failure