Skip to content

Commit 469757e

Browse files
authored
[AMDGPU] Handle memcpy()-like ops in LowerBufferFatPointers (#126621)
Since LowerBufferFatPointers runs before PreISelIntrinsicLowering, which normally handles unsupported memcpy()s, and since you can't have a `noalias {ptr addrspace(8), i32}` because it crashes later passes, manually expand memcpy()s involving buffer fat pointers to loops. Additionally, though they're unlikely to be used, this commit adds support for memset(). This commit doesn't implement writing direct-to-LDS loads as the intrinsics, but leaves that option open for the future.
1 parent 42526d2 commit 469757e

File tree

3 files changed

+3156
-24
lines changed

3 files changed

+3156
-24
lines changed

llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp

Lines changed: 98 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -45,16 +45,16 @@
4545
//
4646
// This pass proceeds in three main phases:
4747
//
48-
// ## Rewriting loads and stores of p7
48+
// ## Rewriting loads and stores of p7 and memcpy()-like handling
4949
//
5050
// The first phase is to rewrite away all loads and stores of `ptr addrspace(7)`,
5151
// including aggregates containing such pointers, to ones that use `i160`. This
52-
// is handled by `StoreFatPtrsAsIntsVisitor` , which visits loads, stores, and
53-
// allocas and, if the loaded or stored type contains `ptr addrspace(7)`,
54-
// rewrites that type to one where the p7s are replaced by i160s, copying other
55-
// parts of aggregates as needed. In the case of a store, each pointer is
56-
// `ptrtoint`d to i160 before storing, and load integers are `inttoptr`d back.
57-
// This same transformation is applied to vectors of pointers.
52+
// is handled by `StoreFatPtrsAsIntsAndExpandMemcpyVisitor`, which visits
53+
// loads, stores, and allocas and, if the loaded or stored type contains `ptr
54+
// addrspace(7)`, rewrites that type to one where the p7s are replaced by i160s,
55+
// copying other parts of aggregates as needed. In the case of a store, each
56+
// pointer is `ptrtoint`d to i160 before storing, and load integers are
57+
// `inttoptr`d back. This same transformation is applied to vectors of pointers.
5858
//
5959
// Such a transformation allows the later phases of the pass to not need
6060
// to handle buffer fat pointers moving to and from memory, where we load
@@ -66,6 +66,10 @@
6666
// Atomic operations on `ptr addrspace(7)` values are not supported, as the
6767
// hardware does not include a 160-bit atomic.
6868
//
69+
// In order to save on O(N) work and to ensure that the contents type
70+
// legalizer correctly splits up wide loads, also unconditionally lower
71+
// memcpy-like intrinsics into loops here.
72+
//
6973
// ## Buffer contents type legalization
7074
//
7175
// The underlying buffer intrinsics only support types up to 128 bits long,
@@ -231,20 +235,24 @@
231235
#include "llvm/IR/InstIterator.h"
232236
#include "llvm/IR/InstVisitor.h"
233237
#include "llvm/IR/Instructions.h"
238+
#include "llvm/IR/IntrinsicInst.h"
234239
#include "llvm/IR/Intrinsics.h"
235240
#include "llvm/IR/IntrinsicsAMDGPU.h"
236241
#include "llvm/IR/Metadata.h"
237242
#include "llvm/IR/Operator.h"
238243
#include "llvm/IR/PatternMatch.h"
239244
#include "llvm/IR/ReplaceConstant.h"
245+
#include "llvm/IR/ValueHandle.h"
240246
#include "llvm/InitializePasses.h"
241247
#include "llvm/Pass.h"
248+
#include "llvm/Support/AMDGPUAddrSpace.h"
242249
#include "llvm/Support/Alignment.h"
243250
#include "llvm/Support/AtomicOrdering.h"
244251
#include "llvm/Support/Debug.h"
245252
#include "llvm/Support/ErrorHandling.h"
246253
#include "llvm/Transforms/Utils/Cloning.h"
247254
#include "llvm/Transforms/Utils/Local.h"
255+
#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
248256
#include "llvm/Transforms/Utils/ValueMapper.h"
249257

250258
#define DEBUG_TYPE "amdgpu-lower-buffer-fat-pointers"
@@ -431,14 +439,16 @@ namespace {
431439
/// marshalling costs when reading or storing these values, but since placing
432440
/// such pointers into memory is an uncommon operation at best, we feel that
433441
/// this cost is acceptable for better performance in the common case.
434-
class StoreFatPtrsAsIntsVisitor
435-
: public InstVisitor<StoreFatPtrsAsIntsVisitor, bool> {
442+
class StoreFatPtrsAsIntsAndExpandMemcpyVisitor
443+
: public InstVisitor<StoreFatPtrsAsIntsAndExpandMemcpyVisitor, bool> {
436444
BufferFatPtrToIntTypeMap *TypeMap;
437445

438446
ValueToValueMapTy ConvertedForStore;
439447

440448
IRBuilder<> IRB;
441449

450+
const TargetMachine *TM;
451+
442452
// Convert all the buffer fat pointers within the input value to integers
443453
// so that it can be stored in memory.
444454
Value *fatPtrsToInts(Value *V, Type *From, Type *To, const Twine &Name);
@@ -448,20 +458,27 @@ class StoreFatPtrsAsIntsVisitor
448458
Value *intsToFatPtrs(Value *V, Type *From, Type *To, const Twine &Name);
449459

450460
public:
451-
StoreFatPtrsAsIntsVisitor(BufferFatPtrToIntTypeMap *TypeMap, LLVMContext &Ctx)
452-
: TypeMap(TypeMap), IRB(Ctx) {}
461+
StoreFatPtrsAsIntsAndExpandMemcpyVisitor(BufferFatPtrToIntTypeMap *TypeMap,
462+
LLVMContext &Ctx,
463+
const TargetMachine *TM)
464+
: TypeMap(TypeMap), IRB(Ctx), TM(TM) {}
453465
bool processFunction(Function &F);
454466

455467
bool visitInstruction(Instruction &I) { return false; }
456468
bool visitAllocaInst(AllocaInst &I);
457469
bool visitLoadInst(LoadInst &LI);
458470
bool visitStoreInst(StoreInst &SI);
459471
bool visitGetElementPtrInst(GetElementPtrInst &I);
472+
473+
bool visitMemCpyInst(MemCpyInst &MCI);
474+
bool visitMemMoveInst(MemMoveInst &MMI);
475+
bool visitMemSetInst(MemSetInst &MSI);
476+
bool visitMemSetPatternInst(MemSetPatternInst &MSPI);
460477
};
461478
} // namespace
462479

463-
Value *StoreFatPtrsAsIntsVisitor::fatPtrsToInts(Value *V, Type *From, Type *To,
464-
const Twine &Name) {
480+
Value *StoreFatPtrsAsIntsAndExpandMemcpyVisitor::fatPtrsToInts(
481+
Value *V, Type *From, Type *To, const Twine &Name) {
465482
if (From == To)
466483
return V;
467484
ValueToValueMapTy::iterator Find = ConvertedForStore.find(V);
@@ -498,8 +515,8 @@ Value *StoreFatPtrsAsIntsVisitor::fatPtrsToInts(Value *V, Type *From, Type *To,
498515
return Ret;
499516
}
500517

501-
Value *StoreFatPtrsAsIntsVisitor::intsToFatPtrs(Value *V, Type *From, Type *To,
502-
const Twine &Name) {
518+
Value *StoreFatPtrsAsIntsAndExpandMemcpyVisitor::intsToFatPtrs(
519+
Value *V, Type *From, Type *To, const Twine &Name) {
503520
if (From == To)
504521
return V;
505522
if (isBufferFatPtrOrVector(To)) {
@@ -531,18 +548,25 @@ Value *StoreFatPtrsAsIntsVisitor::intsToFatPtrs(Value *V, Type *From, Type *To,
531548
return Ret;
532549
}
533550

534-
bool StoreFatPtrsAsIntsVisitor::processFunction(Function &F) {
551+
bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::processFunction(Function &F) {
535552
bool Changed = false;
536-
// The visitors will mutate GEPs and allocas, but will push loads and stores
537-
// to the worklist to avoid invalidation.
553+
// Process memcpy-like instructions after the main iteration because they can
554+
// invalidate iterators.
555+
SmallVector<WeakTrackingVH> CanBecomeLoops;
538556
for (Instruction &I : make_early_inc_range(instructions(F))) {
539-
Changed |= visit(I);
557+
if (isa<MemTransferInst, MemSetInst, MemSetPatternInst>(I))
558+
CanBecomeLoops.push_back(&I);
559+
else
560+
Changed |= visit(I);
561+
}
562+
for (WeakTrackingVH VH : make_early_inc_range(CanBecomeLoops)) {
563+
Changed |= visit(cast<Instruction>(VH));
540564
}
541565
ConvertedForStore.clear();
542566
return Changed;
543567
}
544568

545-
bool StoreFatPtrsAsIntsVisitor::visitAllocaInst(AllocaInst &I) {
569+
bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitAllocaInst(AllocaInst &I) {
546570
Type *Ty = I.getAllocatedType();
547571
Type *NewTy = TypeMap->remapType(Ty);
548572
if (Ty == NewTy)
@@ -551,7 +575,8 @@ bool StoreFatPtrsAsIntsVisitor::visitAllocaInst(AllocaInst &I) {
551575
return true;
552576
}
553577

554-
bool StoreFatPtrsAsIntsVisitor::visitGetElementPtrInst(GetElementPtrInst &I) {
578+
bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitGetElementPtrInst(
579+
GetElementPtrInst &I) {
555580
Type *Ty = I.getSourceElementType();
556581
Type *NewTy = TypeMap->remapType(Ty);
557582
if (Ty == NewTy)
@@ -563,7 +588,7 @@ bool StoreFatPtrsAsIntsVisitor::visitGetElementPtrInst(GetElementPtrInst &I) {
563588
return true;
564589
}
565590

566-
bool StoreFatPtrsAsIntsVisitor::visitLoadInst(LoadInst &LI) {
591+
bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitLoadInst(LoadInst &LI) {
567592
Type *Ty = LI.getType();
568593
Type *IntTy = TypeMap->remapType(Ty);
569594
if (Ty == IntTy)
@@ -581,7 +606,7 @@ bool StoreFatPtrsAsIntsVisitor::visitLoadInst(LoadInst &LI) {
581606
return true;
582607
}
583608

584-
bool StoreFatPtrsAsIntsVisitor::visitStoreInst(StoreInst &SI) {
609+
bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitStoreInst(StoreInst &SI) {
585610
Value *V = SI.getValueOperand();
586611
Type *Ty = V->getType();
587612
Type *IntTy = TypeMap->remapType(Ty);
@@ -597,6 +622,47 @@ bool StoreFatPtrsAsIntsVisitor::visitStoreInst(StoreInst &SI) {
597622
return true;
598623
}
599624

625+
bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitMemCpyInst(
626+
MemCpyInst &MCI) {
627+
// TODO: Allow memcpy.p7.p3 as a synonym for the direct-to-LDS copy, which'll
628+
// need loop expansion here.
629+
if (MCI.getSourceAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER &&
630+
MCI.getDestAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER)
631+
return false;
632+
llvm::expandMemCpyAsLoop(&MCI,
633+
TM->getTargetTransformInfo(*MCI.getFunction()));
634+
MCI.eraseFromParent();
635+
return true;
636+
}
637+
638+
bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitMemMoveInst(
639+
MemMoveInst &MMI) {
640+
if (MMI.getSourceAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER &&
641+
MMI.getDestAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER)
642+
return false;
643+
report_fatal_error(
644+
"memmove() on buffer descriptors is not implemented because pointer "
645+
"comparison on buffer descriptors isn't implemented\n");
646+
}
647+
648+
bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitMemSetInst(
649+
MemSetInst &MSI) {
650+
if (MSI.getDestAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER)
651+
return false;
652+
llvm::expandMemSetAsLoop(&MSI);
653+
MSI.eraseFromParent();
654+
return true;
655+
}
656+
657+
bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitMemSetPatternInst(
658+
MemSetPatternInst &MSPI) {
659+
if (MSPI.getDestAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER)
660+
return false;
661+
llvm::expandMemSetPatternAsLoop(&MSPI);
662+
MSPI.eraseFromParent();
663+
return true;
664+
}
665+
600666
namespace {
601667
/// Convert loads/stores of types that the buffer intrinsics can't handle into
602668
/// one ore more such loads/stores that consist of legal types.
@@ -1127,6 +1193,7 @@ bool LegalizeBufferContentTypesVisitor::visitStoreInst(StoreInst &SI) {
11271193

11281194
bool LegalizeBufferContentTypesVisitor::processFunction(Function &F) {
11291195
bool Changed = false;
1196+
// Note, memory transfer intrinsics won't be seen by this visitor: the earlier
// StoreFatPtrsAsIntsAndExpandMemcpyVisitor phase already expanded them into
// load/store loops, so only ordinary loads and stores reach this point.
11301197
for (Instruction &I : make_early_inc_range(instructions(F))) {
11311198
Changed |= visit(I);
11321199
}
@@ -2084,6 +2151,12 @@ static bool isRemovablePointerIntrinsic(Intrinsic::ID IID) {
20842151
case Intrinsic::invariant_end:
20852152
case Intrinsic::launder_invariant_group:
20862153
case Intrinsic::strip_invariant_group:
2154+
case Intrinsic::memcpy:
2155+
case Intrinsic::memcpy_inline:
2156+
case Intrinsic::memmove:
2157+
case Intrinsic::memset:
2158+
case Intrinsic::memset_inline:
2159+
case Intrinsic::experimental_memset_pattern:
20872160
return true;
20882161
}
20892162
}
@@ -2353,7 +2426,8 @@ bool AMDGPULowerBufferFatPointers::run(Module &M, const TargetMachine &TM) {
23532426
/*RemoveDeadConstants=*/false, /*IncludeSelf=*/true);
23542427
}
23552428

2356-
StoreFatPtrsAsIntsVisitor MemOpsRewrite(&IntTM, M.getContext());
2429+
StoreFatPtrsAsIntsAndExpandMemcpyVisitor MemOpsRewrite(&IntTM, M.getContext(),
2430+
&TM);
23572431
LegalizeBufferContentTypesVisitor BufferContentsTypeRewrite(DL,
23582432
M.getContext());
23592433
for (Function &F : M.functions()) {

0 commit comments

Comments
 (0)