//
// This pass proceeds in three main phases:
//
- // ## Rewriting loads and stores of p7
+ // ## Rewriting loads and stores of p7 and memcpy()-like handling
//
// The first phase is to rewrite away all loads and stores of `ptr addrspace(7)`,
// including aggregates containing such pointers, to ones that use `i160`. This
- // is handled by `StoreFatPtrsAsIntsVisitor`, which visits loads, stores, and
- // allocas and, if the loaded or stored type contains `ptr addrspace(7)`,
- // rewrites that type to one where the p7s are replaced by i160s, copying other
- // parts of aggregates as needed. In the case of a store, each pointer is
- // `ptrtoint`d to i160 before storing, and loaded integers are `inttoptr`d back.
- // This same transformation is applied to vectors of pointers.
+ // is handled by `StoreFatPtrsAsIntsAndExpandMemcpyVisitor`, which visits
+ // loads, stores, and allocas and, if the loaded or stored type contains `ptr
+ // addrspace(7)`, rewrites that type to one where the p7s are replaced by i160s,
+ // copying other parts of aggregates as needed. In the case of a store, each
+ // pointer is `ptrtoint`d to i160 before storing, and loaded integers are
+ // `inttoptr`d back. This same transformation is applied to vectors of pointers.
//
// Such a transformation allows the later phases of the pass to not need
// to handle buffer fat pointers moving to and from memory, where we load
// Atomic operations on `ptr addrspace(7)` values are not supported, as the
// hardware does not include a 160-bit atomic.
//
+ // In order to save on O(N) work and to ensure that the contents type
+ // legalizer correctly splits up wide loads, we also unconditionally lower
+ // memcpy-like intrinsics into loops here.
+ //
// ## Buffer contents type legalization
//
// The underlying buffer intrinsics only support types up to 128 bits long,
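
To make the phase-1 rewrite described above concrete, here is a minimal IR sketch, not taken from the patch and with invented value names, of a private-memory spill of a buffer fat pointer before and after the rewrite:

  ; Before: the fat pointer is stored and reloaded directly.
  %slot = alloca ptr addrspace(7), addrspace(5)
  store ptr addrspace(7) %fat, ptr addrspace(5) %slot
  %reload = load ptr addrspace(7), ptr addrspace(5) %slot

  ; After: the alloca is retyped to i160, stores go through ptrtoint, and
  ; loads come back through inttoptr.
  %slot = alloca i160, addrspace(5)
  %fat.int = ptrtoint ptr addrspace(7) %fat to i160
  store i160 %fat.int, ptr addrspace(5) %slot
  %reload.int = load i160, ptr addrspace(5) %slot
  %reload = inttoptr i160 %reload.int to ptr addrspace(7)

The same idea applies element-wise to vectors of `ptr addrspace(7)` and to aggregates that contain such pointers.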
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/Instructions.h"
+ #include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/ReplaceConstant.h"
+ #include "llvm/IR/ValueHandle.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
+ #include "llvm/Support/AMDGPUAddrSpace.h"
#include "llvm/Support/Alignment.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/Local.h"
+ #include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
#include "llvm/Transforms/Utils/ValueMapper.h"

#define DEBUG_TYPE "amdgpu-lower-buffer-fat-pointers"
@@ -431,14 +439,16 @@ namespace {
/// marshalling costs when reading or storing these values, but since placing
/// such pointers into memory is an uncommon operation at best, we feel that
/// this cost is acceptable for better performance in the common case.
- class StoreFatPtrsAsIntsVisitor
-     : public InstVisitor<StoreFatPtrsAsIntsVisitor, bool> {
+ class StoreFatPtrsAsIntsAndExpandMemcpyVisitor
+     : public InstVisitor<StoreFatPtrsAsIntsAndExpandMemcpyVisitor, bool> {
  BufferFatPtrToIntTypeMap *TypeMap;

  ValueToValueMapTy ConvertedForStore;

  IRBuilder<> IRB;

+   const TargetMachine *TM;
+
  // Convert all the buffer fat pointers within the input value to integers
  // so that it can be stored in memory.
  Value *fatPtrsToInts(Value *V, Type *From, Type *To, const Twine &Name);
@@ -448,20 +458,27 @@ class StoreFatPtrsAsIntsVisitor
  Value *intsToFatPtrs(Value *V, Type *From, Type *To, const Twine &Name);

public:
-   StoreFatPtrsAsIntsVisitor(BufferFatPtrToIntTypeMap *TypeMap, LLVMContext &Ctx)
-       : TypeMap(TypeMap), IRB(Ctx) {}
+   StoreFatPtrsAsIntsAndExpandMemcpyVisitor(BufferFatPtrToIntTypeMap *TypeMap,
+                                            LLVMContext &Ctx,
+                                            const TargetMachine *TM)
+       : TypeMap(TypeMap), IRB(Ctx), TM(TM) {}
  bool processFunction(Function &F);

  bool visitInstruction(Instruction &I) { return false; }
  bool visitAllocaInst(AllocaInst &I);
  bool visitLoadInst(LoadInst &LI);
  bool visitStoreInst(StoreInst &SI);
  bool visitGetElementPtrInst(GetElementPtrInst &I);
+
+   bool visitMemCpyInst(MemCpyInst &MCI);
+   bool visitMemMoveInst(MemMoveInst &MMI);
+   bool visitMemSetInst(MemSetInst &MSI);
+   bool visitMemSetPatternInst(MemSetPatternInst &MSPI);
};
} // namespace

- Value *StoreFatPtrsAsIntsVisitor::fatPtrsToInts(Value *V, Type *From, Type *To,
-                                                 const Twine &Name) {
+ Value *StoreFatPtrsAsIntsAndExpandMemcpyVisitor::fatPtrsToInts(
+     Value *V, Type *From, Type *To, const Twine &Name) {
  if (From == To)
    return V;
  ValueToValueMapTy::iterator Find = ConvertedForStore.find(V);
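
As an aside on `fatPtrsToInts`: for aggregates the conversion cannot be a single `ptrtoint`, so, per the comment above about copying other parts of aggregates as needed, the rewrite has to take the value apart and rebuild it. A rough, hypothetical IR sketch (names and the choice of `poison` are illustrative, not output of the patch) for a `{ptr addrspace(7), i32}` value about to be stored:

  %p = extractvalue { ptr addrspace(7), i32 } %agg, 0
  %p.int = ptrtoint ptr addrspace(7) %p to i160
  %rest = extractvalue { ptr addrspace(7), i32 } %agg, 1
  %tmp = insertvalue { i160, i32 } poison, i160 %p.int, 0
  %agg.int = insertvalue { i160, i32 } %tmp, i32 %rest, 1
  store { i160, i32 } %agg.int, ptr addrspace(5) %slot

`intsToFatPtrs` performs the reverse walk after a load, turning each `i160` back into a `ptr addrspace(7)`.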
@@ -498,8 +515,8 @@ Value *StoreFatPtrsAsIntsVisitor::fatPtrsToInts(Value *V, Type *From, Type *To,
  return Ret;
}

- Value *StoreFatPtrsAsIntsVisitor::intsToFatPtrs(Value *V, Type *From, Type *To,
-                                                 const Twine &Name) {
+ Value *StoreFatPtrsAsIntsAndExpandMemcpyVisitor::intsToFatPtrs(
+     Value *V, Type *From, Type *To, const Twine &Name) {
  if (From == To)
    return V;
  if (isBufferFatPtrOrVector(To)) {
@@ -531,18 +548,25 @@ Value *StoreFatPtrsAsIntsVisitor::intsToFatPtrs(Value *V, Type *From, Type *To,
  return Ret;
}

- bool StoreFatPtrsAsIntsVisitor::processFunction(Function &F) {
+ bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::processFunction(Function &F) {
  bool Changed = false;
-   // The visitors will mutate GEPs and allocas, but will push loads and stores
-   // to the worklist to avoid invalidation.
+   // Process memcpy-like instructions after the main iteration because they can
+   // invalidate iterators.
+   SmallVector<WeakTrackingVH> CanBecomeLoops;
  for (Instruction &I : make_early_inc_range(instructions(F))) {
-     Changed |= visit(I);
+     if (isa<MemTransferInst, MemSetInst, MemSetPatternInst>(I))
+       CanBecomeLoops.push_back(&I);
+     else
+       Changed |= visit(I);
+   }
+   for (WeakTrackingVH VH : make_early_inc_range(CanBecomeLoops)) {
+     Changed |= visit(cast<Instruction>(VH));
  }
  ConvertedForStore.clear();
  return Changed;
}

- bool StoreFatPtrsAsIntsVisitor::visitAllocaInst(AllocaInst &I) {
+ bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitAllocaInst(AllocaInst &I) {
  Type *Ty = I.getAllocatedType();
  Type *NewTy = TypeMap->remapType(Ty);
  if (Ty == NewTy)
@@ -551,7 +575,8 @@ bool StoreFatPtrsAsIntsVisitor::visitAllocaInst(AllocaInst &I) {
  return true;
}

- bool StoreFatPtrsAsIntsVisitor::visitGetElementPtrInst(GetElementPtrInst &I) {
+ bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitGetElementPtrInst(
+     GetElementPtrInst &I) {
  Type *Ty = I.getSourceElementType();
  Type *NewTy = TypeMap->remapType(Ty);
  if (Ty == NewTy)
@@ -563,7 +588,7 @@ bool StoreFatPtrsAsIntsVisitor::visitGetElementPtrInst(GetElementPtrInst &I) {
  return true;
}

- bool StoreFatPtrsAsIntsVisitor::visitLoadInst(LoadInst &LI) {
+ bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitLoadInst(LoadInst &LI) {
  Type *Ty = LI.getType();
  Type *IntTy = TypeMap->remapType(Ty);
  if (Ty == IntTy)
@@ -581,7 +606,7 @@ bool StoreFatPtrsAsIntsVisitor::visitLoadInst(LoadInst &LI) {
  return true;
}

- bool StoreFatPtrsAsIntsVisitor::visitStoreInst(StoreInst &SI) {
+ bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitStoreInst(StoreInst &SI) {
  Value *V = SI.getValueOperand();
  Type *Ty = V->getType();
  Type *IntTy = TypeMap->remapType(Ty);
@@ -597,6 +622,47 @@ bool StoreFatPtrsAsIntsVisitor::visitStoreInst(StoreInst &SI) {
  return true;
}

+ bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitMemCpyInst(
+     MemCpyInst &MCI) {
+   // TODO: Allow memcpy.p7.p3 as a synonym for the direct-to-LDS copy, which'll
+   // need loop expansion here.
+   if (MCI.getSourceAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER &&
+       MCI.getDestAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER)
+     return false;
+   llvm::expandMemCpyAsLoop(&MCI,
+                            TM->getTargetTransformInfo(*MCI.getFunction()));
+   MCI.eraseFromParent();
+   return true;
+ }
+
+ bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitMemMoveInst(
+     MemMoveInst &MMI) {
+   if (MMI.getSourceAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER &&
+       MMI.getDestAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER)
+     return false;
+   report_fatal_error(
+       "memmove() on buffer descriptors is not implemented because pointer "
+       "comparison on buffer descriptors isn't implemented\n");
+ }
+
+ bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitMemSetInst(
+     MemSetInst &MSI) {
+   if (MSI.getDestAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER)
+     return false;
+   llvm::expandMemSetAsLoop(&MSI);
+   MSI.eraseFromParent();
+   return true;
+ }
+
+ bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitMemSetPatternInst(
+     MemSetPatternInst &MSPI) {
+   if (MSPI.getDestAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER)
+     return false;
+   llvm::expandMemSetPatternAsLoop(&MSPI);
+   MSPI.eraseFromParent();
+   return true;
+ }
+
namespace {
/// Convert loads/stores of types that the buffer intrinsics can't handle into
/// one or more such loads/stores that consist of legal types.
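
For orientation, a hedged sketch of what the unconditional lowering in `visitMemCpyInst` produces. `expandMemCpyAsLoop` picks the actual copy width and loop structure from the target's TTI, so the byte-wise fragment below (invented names, preheader and exit blocks omitted, not literal output of the patch) only shows the general shape:

  ; Before: a fixed-size copy out of a buffer fat pointer.
  call void @llvm.memcpy.p0.p7.i32(ptr %dst, ptr addrspace(7) %src, i32 64, i1 false)

  ; After expansion the intrinsic is erased and replaced by a loop of ordinary
  ; loads and stores, which the later phases of this pass legalize and lower.
  loop:
    %idx = phi i32 [ 0, %entry ], [ %idx.next, %loop ]
    %s = getelementptr inbounds i8, ptr addrspace(7) %src, i32 %idx
    %d = getelementptr inbounds i8, ptr %dst, i32 %idx
    %byte = load i8, ptr addrspace(7) %s
    store i8 %byte, ptr %d
    %idx.next = add i32 %idx, 1
    %done = icmp eq i32 %idx.next, 64
    br i1 %done, label %exit, label %loop

`visitMemSetInst` and `visitMemSetPatternInst` follow the same pattern via `expandMemSetAsLoop` and `expandMemSetPatternAsLoop`, writing the fill value in a loop, while `memmove` on buffer descriptors is rejected outright because it would require pointer comparison.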
@@ -1127,6 +1193,7 @@ bool LegalizeBufferContentTypesVisitor::visitStoreInst(StoreInst &SI) {

bool LegalizeBufferContentTypesVisitor::processFunction(Function &F) {
  bool Changed = false;
+   // Note, memory transfer intrinsics won't be seen here: the phase above has
+   // already expanded them into load/store loops.
  for (Instruction &I : make_early_inc_range(instructions(F))) {
    Changed |= visit(I);
  }
@@ -2084,6 +2151,12 @@ static bool isRemovablePointerIntrinsic(Intrinsic::ID IID) {
  case Intrinsic::invariant_end:
  case Intrinsic::launder_invariant_group:
  case Intrinsic::strip_invariant_group:
+   case Intrinsic::memcpy:
+   case Intrinsic::memcpy_inline:
+   case Intrinsic::memmove:
+   case Intrinsic::memset:
+   case Intrinsic::memset_inline:
+   case Intrinsic::experimental_memset_pattern:
    return true;
  }
}
@@ -2353,7 +2426,8 @@ bool AMDGPULowerBufferFatPointers::run(Module &M, const TargetMachine &TM) {
                       /*RemoveDeadConstants=*/false, /*IncludeSelf=*/true);
  }

- StoreFatPtrsAsIntsVisitor MemOpsRewrite(&IntTM, M.getContext());
+ StoreFatPtrsAsIntsAndExpandMemcpyVisitor MemOpsRewrite(&IntTM, M.getContext(),
+                                                        &TM);
  LegalizeBufferContentTypesVisitor BufferContentsTypeRewrite(DL,
                                                              M.getContext());
  for (Function &F : M.functions()) {