[Matrix] Optimize static extracts with ShapeInfo

jroelofs · jroelofs · commit 53098ce18f74 · 2025-05-28T11:10:27.000-07:00
For ExtractElementInsts with static indices that extract from a Matrix, use the
known layout of the Rows/Columns to look through the shuffles that
embedInVector creates, which in some cases allows us to delete them.
diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -34,6 +34,7 @@
 #include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/MatrixBuilder.h"
@@ -568,6 +569,7 @@ class LowerMatrixIntrinsics {
         return M;
 
       MatrixVal = M.embedInVector(Builder);
+      Inst2ColumnMatrix[MatrixVal] = M;
     }
 
     // Otherwise split MatrixVal.
@@ -632,7 +634,7 @@ class LowerMatrixIntrinsics {
       default:
         return isUniformShape(II);
       }
-    return isUniformShape(V) || isa<StoreInst>(V) || isa<LoadInst>(V);
+    return isUniformShape(V) || isa<StoreInst>(V) || isa<LoadInst>(V) || isa<ExtractElementInst>(V);
   }
 
   /// Propagate the shape information of instructions to their users.
@@ -1083,6 +1085,18 @@ class LowerMatrixIntrinsics {
         Changed |= VisitStore(cast<StoreInst>(Inst), Op1, Op2, Builder);
     }
 
+    // Fifth, lower instructions which can make use of shape information, but do
+    // not have shapes themselves.
+    for (auto *BB : RPOT)
+      for (Instruction &Inst : *BB) {
+        IRBuilder<> Builder(&Inst);
+
+        Value *Op1;
+        uint64_t Index;
+        if (match(&Inst, m_ExtractElt(m_Value(Op1), m_ConstantInt(Index))))
+          Changed |= VisitExtractElt(cast<ExtractElementInst>(&Inst), Index);
+      }
+
     if (ORE) {
       RemarkGenerator RemarkGen(Inst2ColumnMatrix, *ORE, Func);
       RemarkGen.emitRemarks();
@@ -1364,8 +1378,10 @@ class LowerMatrixIntrinsics {
     Value *Flattened = nullptr;
     for (Use &U : llvm::make_early_inc_range(Inst->uses())) {
       if (!ShapeMap.contains(U.getUser())) {
-        if (!Flattened)
+        if (!Flattened) {
           Flattened = Matrix.embedInVector(Builder);
+          Inst2ColumnMatrix[Flattened] = Matrix;
+        }
         U.set(Flattened);
       }
     }
@@ -2142,6 +2158,30 @@ class LowerMatrixIntrinsics {
     return true;
   }
 
+  /// Retarget a static extract at its matrix vector; returns true on change.
+  bool VisitExtractElt(ExtractElementInst *Inst, uint64_t Index) {
+    Value *Op0 = Inst->getOperand(0);
+    auto *VTy = cast<VectorType>(Op0->getType());
+    const ElementCount EC = VTy->getElementCount();
+    // Lanes are 0-based, so Index == count is out of range (fixed width only).
+    if (!EC.isScalable() && EC.getKnownMinValue() <= Index) {
+      Inst->replaceAllUsesWith(PoisonValue::get(VTy->getElementType()));
+      ToRemove.push_back(Inst);
+      return true;
+    }
+    auto *I = Inst2ColumnMatrix.find(Op0);
+    if (I == Inst2ColumnMatrix.end())
+      return false; // No matrix recorded for the source vector.
+    const MatrixTy &M = I->second;
+    // Split the flat index into (vector, lane) using the matrix stride.
+    IRBuilder<> Builder(Inst);
+    Inst->setOperand(0, M.getVector(Index / M.getStride()));
+    Inst->setOperand(1, Builder.getInt32(Index % M.getStride()));
+    if (Op0->use_empty() && isa<Instruction>(Op0))
+      ToRemove.push_back(cast<Instruction>(Op0));
+    return true;
+  }
+
   /// Lower binary operators, if shape information is available.
   bool VisitBinaryOperator(BinaryOperator *Inst) {
     auto I = ShapeMap.find(Inst);
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/extract.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/extract.ll
@@ -6,14 +6,28 @@ define float @extract_static(ptr %in, ptr %out) {
 ; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x float>, ptr [[IN:%.*]], align 16
 ; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr float, ptr [[IN]], i64 2
 ; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x float>, ptr [[VEC_GEP]], align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x float> [[COL_LOAD]], <2 x float> [[COL_LOAD1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <2 x float> [[COL_LOAD1]], i32 1
 ; CHECK-NEXT:    ret float [[EXTRACT]]
 ;
   %inv = load <4 x float>, ptr %in
   %invt  = call <4 x float> @llvm.matrix.transpose(<4 x float> %inv, i32 2, i32 2)
   %invtt = call <4 x float> @llvm.matrix.transpose(<4 x float> %invt, i32 2, i32 2)
-  %extract = extractelement <4 x float> %invtt, i32 0
+  %extract = extractelement <4 x float> %invtt, i32 3
+  ret float %extract
+}
+
+define float @extract_static_outofbounds(ptr %in, ptr %out) {
+; CHECK-LABEL: @extract_static_outofbounds(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x float>, ptr [[IN:%.*]], align 16
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr float, ptr [[IN]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x float>, ptr [[VEC_GEP]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x float> [[COL_LOAD]], <2 x float> [[COL_LOAD1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    ret float poison
+;
+  %inv = load <4 x float>, ptr %in
+  %invt  = call <4 x float> @llvm.matrix.transpose(<4 x float> %inv, i32 2, i32 2)
+  %invtt = call <4 x float> @llvm.matrix.transpose(<4 x float> %invt, i32 2, i32 2)
+  %extract = extractelement <4 x float> %invtt, i32 5
   ret float %extract
 }
 
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/transpose-opts.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/transpose-opts.ll
@@ -55,11 +55,11 @@ define void @multiply_ntt(ptr %A, ptr %B, ptr %C, ptr %R) {
 ; REMARK-NEXT: Function:        multiply_ntt
 ; REMARK-NEXT: Args:
 ; REMARK-NEXT:   - String:          'Lowered with '
-; REMARK-NEXT:   - NumStores:       '4'
+; REMARK-NEXT:   - NumStores:       '0'
 ; REMARK-NEXT:   - String:          ' stores, '
-; REMARK-NEXT:   - NumLoads:        '10'
+; REMARK-NEXT:   - NumLoads:        '3'
 ; REMARK-NEXT:   - String:          ' loads, '
-; REMARK-NEXT:   - NumComputeOps:   '38'
+; REMARK-NEXT:   - NumComputeOps:   '0'
 ; REMARK-NEXT:   - String:          ' compute ops, '
 ; REMARK-NEXT:   - NumExposedTransposes: '0'
 ; REMARK-NEXT:   - String:          ' exposed transposes'
@@ -443,11 +443,11 @@ define void @multiply_nt_t(ptr %A, ptr %B, ptr %C) {
 ; REMARK-NEXT: Function:        multiply_nt_t
 ; REMARK-NEXT: Args:
 ; REMARK-NEXT:   - String:          'Lowered with '
-; REMARK-NEXT:   - NumStores:       '4'
+; REMARK-NEXT:   - NumStores:       '0'
 ; REMARK-NEXT:   - String:          ' stores, '
-; REMARK-NEXT:   - NumLoads:        '9'
+; REMARK-NEXT:   - NumLoads:        '3'
 ; REMARK-NEXT:   - String:          ' loads, '
-; REMARK-NEXT:   - NumComputeOps:   '20'
+; REMARK-NEXT:   - NumComputeOps:   '0'
 ; REMARK-NEXT:   - String:          ' compute ops, '
 ; REMARK-NEXT:   - NumExposedTransposes: '0'
 ; REMARK-NEXT:   - String:          ' exposed transposes'
@@ -578,11 +578,11 @@ define void @multiply_ntt_t(ptr %A, ptr %B, ptr %C, ptr %R) {
 ; REMARK-NEXT: Function:        multiply_ntt_t
 ; REMARK-NEXT: Args:
 ; REMARK-NEXT:   - String:          'Lowered with '
-; REMARK-NEXT:   - NumStores:       '6'
+; REMARK-NEXT:   - NumStores:       '0'
 ; REMARK-NEXT:   - String:          ' stores, '
-; REMARK-NEXT:   - NumLoads:        '18'
+; REMARK-NEXT:   - NumLoads:        '6'
 ; REMARK-NEXT:   - String:          ' loads, '
-; REMARK-NEXT:   - NumComputeOps:   '60'
+; REMARK-NEXT:   - NumComputeOps:   '0'
 ; REMARK-NEXT:   - String:          ' compute ops, '
 ; REMARK-NEXT:   - NumExposedTransposes: '0'
 ; REMARK-NEXT:   - String:          ' exposed transposes'