Skip to content

Commit 5cbc7d2

Browse files
committed
[NVPTX] Use LDG for pointer induction variables.
More specifically, make NVPTXISelDAGToDAG able to emit cached loads (LDG) for pointer induction variables. Also fix a latent bug where LDG was not restricted to kernel functions. I believe this bug could not be triggered so far, since we do not currently infer that a pointer is global outside a kernel function, and only loads of global pointers are considered for cached loads.

llvm-svn: 244166
1 parent 4cc10d4 commit 5cbc7d2

File tree

2 files changed

+84
-11
lines changed

2 files changed

+84
-11
lines changed

llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp

Lines changed: 29 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
//===----------------------------------------------------------------------===//
1313

1414
#include "NVPTXISelDAGToDAG.h"
15+
#include "NVPTXUtilities.h"
1516
#include "llvm/Analysis/ValueTracking.h"
1617
#include "llvm/IR/GlobalValue.h"
1718
#include "llvm/IR/Instructions.h"
@@ -546,18 +547,36 @@ static unsigned int getCodeAddrSpace(MemSDNode *N) {
546547
}
547548

548549
static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
549-
unsigned codeAddrSpace, const DataLayout &DL) {
550-
if (!Subtarget.hasLDG() || codeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL) {
550+
unsigned CodeAddrSpace, MachineFunction *F) {
551+
// To use non-coherent caching, the load has to be from global
552+
// memory and we have to prove that the memory area is not written
553+
// to anywhere for the duration of the kernel call, not even after
554+
// the load.
555+
//
556+
// To ensure that there are no writes to the memory, we require the
557+
// underlying pointer to be a noalias (__restrict) kernel parameter
558+
// that is never used for a write. We can only do this for kernel
559+
// functions since from within a device function, we cannot know if
560+
// there were or will be writes to the memory from the caller - or we
561+
// could, but then we would have to do inter-procedural analysis.
562+
if (!Subtarget.hasLDG() || CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL ||
563+
!isKernelFunction(*F->getFunction())) {
551564
return false;
552565
}
553566

554-
// Check whether load operates on a readonly argument.
555-
bool canUseLDG = false;
556-
if (const Argument *A = dyn_cast<const Argument>(
557-
GetUnderlyingObject(N->getMemOperand()->getValue(), DL)))
558-
canUseLDG = A->onlyReadsMemory() && A->hasNoAliasAttr();
567+
// We use GetUnderlyingObjects() here instead of
568+
// GetUnderlyingObject() mainly because the former looks through phi
569+
// nodes while the latter does not. We need to look through phi
570+
// nodes to handle pointer induction variables.
571+
SmallVector<Value *, 8> Objs;
572+
GetUnderlyingObjects(const_cast<Value *>(N->getMemOperand()->getValue()),
573+
Objs, F->getDataLayout());
574+
for (Value *Obj : Objs) {
575+
auto *A = dyn_cast<const Argument>(Obj);
576+
if (!A || !A->onlyReadsMemory() || !A->hasNoAliasAttr()) return false;
577+
}
559578

560-
return canUseLDG;
579+
return true;
561580
}
562581

563582
SDNode *NVPTXDAGToDAGISel::SelectIntrinsicNoChain(SDNode *N) {
@@ -654,7 +673,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) {
654673
// Address Space Setting
655674
unsigned int codeAddrSpace = getCodeAddrSpace(LD);
656675

657-
if (canLowerToLDG(LD, *Subtarget, codeAddrSpace, CurDAG->getDataLayout())) {
676+
if (canLowerToLDG(LD, *Subtarget, codeAddrSpace, MF)) {
658677
return SelectLDGLDU(N);
659678
}
660679

@@ -892,7 +911,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
892911
// Address Space Setting
893912
unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD);
894913

895-
if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, CurDAG->getDataLayout())) {
914+
if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) {
896915
return SelectLDGLDU(N);
897916
}
898917

llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll

Lines changed: 55 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -189,7 +189,60 @@ define void @foo18(float ** noalias readonly %from, float ** %to) {
189189
ret void
190190
}
191191

192-
!nvvm.annotations = !{!1 ,!2 ,!3 ,!4 ,!5 ,!6, !7 ,!8 ,!9 ,!10 ,!11 ,!12, !13, !14, !15, !16, !17, !18}
192+
; Test that we can infer a cached load for a pointer induction variable.
193+
; SM20-LABEL: .visible .entry foo19(
194+
; SM20: ld.global.f32
195+
; SM35-LABEL: .visible .entry foo19(
196+
; SM35: ld.global.nc.f32
197+
define void @foo19(float * noalias readonly %from, float * %to, i32 %n) {
198+
entry:
199+
br label %loop
200+
201+
loop:
202+
%i = phi i32 [ 0, %entry ], [ %nexti, %loop ]
203+
%sum = phi float [ 0.0, %entry ], [ %nextsum, %loop ]
204+
%ptr = getelementptr inbounds float, float * %from, i32 %i
205+
%value = load float, float * %ptr, align 4
206+
%nextsum = fadd float %value, %sum
207+
%nexti = add nsw i32 %i, 1
208+
%exitcond = icmp eq i32 %nexti, %n
209+
br i1 %exitcond, label %exit, label %loop
210+
211+
exit:
212+
store float %nextsum, float * %to
213+
ret void
214+
}
215+
216+
; This test captures the case of a non-kernel function. In a
217+
; non-kernel function, without interprocedural analysis, we do not
218+
; know that the parameter is global. We also do not know that the
219+
; pointed-to memory is never written to (for the duration of the
220+
; kernel). For both reasons, we cannot use a cached load here.
221+
; SM20-LABEL: notkernel(
222+
; SM20: ld.f32
223+
; SM35-LABEL: notkernel(
224+
; SM35: ld.f32
225+
define void @notkernel(float * noalias readonly %from, float * %to) {
226+
%1 = load float, float * %from
227+
store float %1, float * %to
228+
ret void
229+
}
230+
231+
; As @notkernel, but with the parameter explicitly marked as global. We still
232+
; do not know that the parameter is never written to (for the duration of the
233+
; kernel). This case does not currently come up normally since we do not infer
234+
; that pointers are global interprocedurally as of 2015-08-05.
235+
; SM20-LABEL: notkernel2(
236+
; SM20: ld.global.f32
237+
; SM35-LABEL: notkernel2(
238+
; SM35: ld.global.f32
239+
define void @notkernel2(float addrspace(1) * noalias readonly %from, float * %to) {
240+
%1 = load float, float addrspace(1) * %from
241+
store float %1, float * %to
242+
ret void
243+
}
244+
245+
!nvvm.annotations = !{!1 ,!2 ,!3 ,!4 ,!5 ,!6, !7 ,!8 ,!9 ,!10 ,!11 ,!12, !13, !14, !15, !16, !17, !18, !19}
193246
!1 = !{void (float *, float *)* @foo1, !"kernel", i32 1}
194247
!2 = !{void (double *, double *)* @foo2, !"kernel", i32 1}
195248
!3 = !{void (i16 *, i16 *)* @foo3, !"kernel", i32 1}
@@ -208,3 +261,4 @@ define void @foo18(float ** noalias readonly %from, float ** %to) {
208261
!16 = !{void (<4 x float> *, <4 x float> *)* @foo16, !"kernel", i32 1}
209262
!17 = !{void (<4 x double> *, <4 x double> *)* @foo17, !"kernel", i32 1}
210263
!18 = !{void (float **, float **)* @foo18, !"kernel", i32 1}
264+
!19 = !{void (float *, float *, i32)* @foo19, !"kernel", i32 1}

0 commit comments

Comments
 (0)