[NVPTX] Use LDG for pointer induction variables.

broune · broune · commit 5cbc7d2999be · 2015-08-05T23:11:57.000Z
More specifically, make NVPTXISelDAGToDAG able to emit cached loads (LDG) for pointer induction variables.

Also fix latent bug where LDG was not restricted to kernel functions. I believe that this could not be triggered so far since we do not currently infer that a pointer is global outside a kernel function, and only loads of global pointers are considered for cached loads.

llvm-svn: 244166
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "NVPTXISelDAGToDAG.h"
+#include "NVPTXUtilities.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/Instructions.h"
@@ -546,18 +547,36 @@ static unsigned int getCodeAddrSpace(MemSDNode *N) {
 }
 
 static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
-                          unsigned codeAddrSpace, const DataLayout &DL) {
-  if (!Subtarget.hasLDG() || codeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL) {
+                          unsigned CodeAddrSpace, MachineFunction *F) {
+  // To use non-coherent caching, the load has to be from global
+  // memory and we have to prove that the memory area is not written
+  // to anywhere for the duration of the kernel call, not even after
+  // the load.
+  //
+  // To ensure that there are no writes to the memory, we require the
+  // underlying pointer to be a noalias (__restrict) kernel parameter
+  // that is never used for a write. We can only do this for kernel
+  // functions since from within a device function, we cannot know if
+  // there were or will be writes to the memory from the caller - or we
+  // could, but then we would have to do inter-procedural analysis.
+  if (!Subtarget.hasLDG() || CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL ||
+      !isKernelFunction(*F->getFunction())) {
     return false;
   }
 
-  // Check whether load operates on a readonly argument.
-  bool canUseLDG = false;
-  if (const Argument *A = dyn_cast<const Argument>(
-          GetUnderlyingObject(N->getMemOperand()->getValue(), DL)))
-    canUseLDG = A->onlyReadsMemory() && A->hasNoAliasAttr();
+  // We use GetUnderlyingObjects() here instead of
+  // GetUnderlyingObject() mainly because the former looks through phi
+  // nodes while the latter does not. We need to look through phi
+  // nodes to handle pointer induction variables.
+  SmallVector<Value *, 8> Objs;
+  GetUnderlyingObjects(const_cast<Value *>(N->getMemOperand()->getValue()),
+                       Objs, F->getDataLayout());
+  for (Value *Obj : Objs) {
+    auto *A = dyn_cast<const Argument>(Obj);
+    if (!A || !A->onlyReadsMemory() || !A->hasNoAliasAttr()) return false;
+  }
 
-  return canUseLDG;
+  return true;
 }
 
 SDNode *NVPTXDAGToDAGISel::SelectIntrinsicNoChain(SDNode *N) {
@@ -654,7 +673,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) {
   // Address Space Setting
   unsigned int codeAddrSpace = getCodeAddrSpace(LD);
 
-  if (canLowerToLDG(LD, *Subtarget, codeAddrSpace, CurDAG->getDataLayout())) {
+  if (canLowerToLDG(LD, *Subtarget, codeAddrSpace, MF)) {
     return SelectLDGLDU(N);
   }
 
@@ -892,7 +911,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
   // Address Space Setting
   unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD);
 
-  if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, CurDAG->getDataLayout())) {
+  if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) {
     return SelectLDGLDU(N);
   }
 
diff --git a/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll b/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll
@@ -189,7 +189,60 @@ define void @foo18(float ** noalias readonly %from, float ** %to) {
   ret void
 }
 
-!nvvm.annotations = !{!1 ,!2 ,!3 ,!4 ,!5 ,!6, !7 ,!8 ,!9 ,!10 ,!11 ,!12, !13, !14, !15, !16, !17, !18}
+; Test that we can infer a cached load for a pointer induction variable.
+; SM20-LABEL: .visible .entry foo19(
+; SM20: ld.global.f32
+; SM35-LABEL: .visible .entry foo19(
+; SM35: ld.global.nc.f32
+define void @foo19(float * noalias readonly %from, float * %to, i32 %n) {
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [ 0, %entry ], [ %nexti, %loop ]
+  %sum = phi float [ 0.0, %entry ], [ %nextsum, %loop ]
+  %ptr = getelementptr inbounds float, float * %from, i32 %i
+  %value = load float, float * %ptr, align 4
+  %nextsum = fadd float %value, %sum
+  %nexti = add nsw i32 %i, 1
+  %exitcond = icmp eq i32 %nexti, %n
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  store float %nextsum, float * %to
+  ret void
+}
+
+; This test captures the case of a non-kernel function. In a
+; non-kernel function, without interprocedural analysis, we do not
+; know that the parameter is global. We also do not know that the
+; pointed-to memory is never written to (for the duration of the
+; kernel). For both reasons, we cannot use a cached load here.
+; SM20-LABEL: notkernel(
+; SM20: ld.f32
+; SM35-LABEL: notkernel(
+; SM35: ld.f32
+define void @notkernel(float * noalias readonly %from, float * %to) {
+  %1 = load float, float * %from
+  store float %1, float * %to
+  ret void
+}
+
+; As @notkernel, but with the parameter explicitly marked as global. We still
+; do not know that the parameter is never written to (for the duration of the
+; kernel). This case does not currently come up normally since we do not infer
+; that pointers are global interprocedurally as of 2015-08-05.
+; SM20-LABEL: notkernel2(
+; SM20: ld.global.f32
+; SM35-LABEL: notkernel2(
+; SM35: ld.global.f32
+define void @notkernel2(float addrspace(1) * noalias readonly %from, float * %to) {
+  %1 = load float, float addrspace(1) * %from
+  store float %1, float * %to
+  ret void
+}
+
+!nvvm.annotations = !{!1 ,!2 ,!3 ,!4 ,!5 ,!6, !7 ,!8 ,!9 ,!10 ,!11 ,!12, !13, !14, !15, !16, !17, !18, !19}
 !1 = !{void (float *, float *)* @foo1, !"kernel", i32 1}
 !2 = !{void (double *, double *)* @foo2, !"kernel", i32 1}
 !3 = !{void (i16 *, i16 *)* @foo3, !"kernel", i32 1}
@@ -208,3 +261,4 @@ define void @foo18(float ** noalias readonly %from, float ** %to) {
 !16 = !{void (<4 x float> *, <4 x float> *)* @foo16, !"kernel", i32 1}
 !17 = !{void (<4 x double> *, <4 x double> *)* @foo17, !"kernel", i32 1}
 !18 = !{void (float **, float **)* @foo18, !"kernel", i32 1}
+!19 = !{void (float *, float *, i32)* @foo19, !"kernel", i32 1}