[Matrix] Propagate shape information through cast insts #141869

jroelofs · 2025-05-28T22:35:08Z

No description provided.

llvmbot · 2025-05-28T22:35:45Z

@llvm/pr-subscribers-llvm-transforms

Author: Jon Roelofs (jroelofs)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/141869.diff

3 Files Affected:

(modified) llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp (+94-4)
(modified) llvm/test/Transforms/LowerMatrixIntrinsics/binop.ll (+43)
(added) llvm/test/Transforms/LowerMatrixIntrinsics/unary.ll (+202)

diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index 756a72e6d97bc..38e777601e31c 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -32,8 +32,10 @@
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/MatrixBuilder.h"
@@ -229,9 +231,18 @@ static bool isUniformShape(Value *V) {
   if (!I)
     return true;
 
-  if (I->isBinaryOp())
+  if (I->isBinaryOp() || I->isCast())
     return true;
 
+  if (auto *II = dyn_cast<IntrinsicInst>(V))
+    switch (II->getIntrinsicID()) {
+    case Intrinsic::abs:
+    case Intrinsic::fabs:
+      return true;
+    default:
+      return false;
+    }
+
   switch (I->getOpcode()) {
   case Instruction::FNeg:
     return true;
@@ -621,7 +632,7 @@ class LowerMatrixIntrinsics {
       case Intrinsic::matrix_column_major_store:
         return true;
       default:
-        return false;
+        return isUniformShape(II);
       }
     return isUniformShape(V) || isa<StoreInst>(V) || isa<LoadInst>(V);
   }
@@ -1066,9 +1077,11 @@ class LowerMatrixIntrinsics {
       Value *Op2;
       if (auto *BinOp = dyn_cast<BinaryOperator>(Inst))
         Changed |= VisitBinaryOperator(BinOp);
-      if (auto *UnOp = dyn_cast<UnaryOperator>(Inst))
+      else if (auto *UnOp = dyn_cast<UnaryOperator>(Inst))
         Changed |= VisitUnaryOperator(UnOp);
-      if (match(Inst, m_Load(m_Value(Op1))))
+      else if (auto *Cast = dyn_cast<CastInst>(Inst))
+        Changed |= VisitCastInstruction(Cast);
+      else if (match(Inst, m_Load(m_Value(Op1))))
         Changed |= VisitLoad(cast<LoadInst>(Inst), Op1, Builder);
       else if (match(Inst, m_Store(m_Value(Op1), m_Value(Op2))))
         Changed |= VisitStore(cast<StoreInst>(Inst), Op1, Op2, Builder);
@@ -1127,6 +1140,9 @@ class LowerMatrixIntrinsics {
     case Intrinsic::matrix_column_major_store:
       LowerColumnMajorStore(Inst);
       break;
+    case Intrinsic::abs:
+    case Intrinsic::fabs:
+      return VisitUniformIntrinsic(cast<IntrinsicInst>(Inst));
     default:
       return false;
     }
@@ -2198,6 +2214,80 @@ class LowerMatrixIntrinsics {
     return true;
   }
 
+  /// Lower the cast \p Inst one matrix vector at a time, if ShapeMap holds
+  /// shape information for it; returns false and changes nothing otherwise.
+  /// Every vector of the operand is cast with Inst's opcode to Inst's element
+  /// type, and the result matrix is handed to finalizeLowering.
+  bool VisitCastInstruction(CastInst *Inst) {
+    auto ShapeIt = ShapeMap.find(Inst);
+    if (ShapeIt == ShapeMap.end())
+      return false;
+
+    IRBuilder<> Builder(Inst);
+    ShapeInfo &Shape = ShapeIt->second;
+    MatrixTy Result;
+    MatrixTy M = getMatrix(Inst->getOperand(0), Shape, Builder);
+    Builder.setFastMathFlags(getFastMathFlags(Inst));
+
+    // The per-vector result type does not depend on the vector index, so
+    // build it once instead of on every loop iteration.
+    auto *OrigTy = cast<VectorType>(Inst->getType());
+    auto *NewTy = VectorType::get(OrigTy->getElementType(),
+                                  ElementCount::getFixed(M.getStride()));
+    for (unsigned Idx = 0; Idx < Shape.getNumVectors(); ++Idx)
+      Result.addVector(
+          Builder.CreateCast(Inst->getOpcode(), M.getVector(Idx), NewTy));
+
+    finalizeLowering(Inst,
+                     Result.addNumComputeOps(getNumOps(Result.getVectorTy()) *
+                                             Result.getNumVectors()),
+                     Builder);
+    return true;
+  }
+
+  /// Lower the uniform shape intrinsics llvm.abs / llvm.fabs one matrix vector
+  /// at a time, if ShapeMap holds shape information for \p Inst; returns false
+  /// and changes nothing otherwise. Other intrinsic IDs are unreachable.
+  bool VisitUniformIntrinsic(IntrinsicInst *Inst) {
+    auto ShapeIt = ShapeMap.find(Inst);
+    if (ShapeIt == ShapeMap.end())
+      return false;
+
+    IRBuilder<> Builder(Inst);
+    ShapeInfo &Shape = ShapeIt->second;
+    MatrixTy Result;
+
+    switch (Inst->getIntrinsicID()) {
+    case Intrinsic::abs:
+    case Intrinsic::fabs: {
+      MatrixTy M = getMatrix(Inst->getOperand(0), Shape, Builder);
+      Builder.setFastMathFlags(getFastMathFlags(Inst));
+      for (unsigned Idx = 0; Idx < Shape.getNumVectors(); ++Idx)
+        switch (Inst->getIntrinsicID()) {
+        case Intrinsic::abs:
+          // llvm.abs carries a second operand; forward it unchanged.
+          Result.addVector(Builder.CreateBinaryIntrinsic(
+              Intrinsic::abs, M.getVector(Idx), Inst->getOperand(1)));
+          break;
+        case Intrinsic::fabs:
+          Result.addVector(Builder.CreateUnaryIntrinsic(Intrinsic::fabs,
+                                                        M.getVector(Idx)));
+          break;
+        default:
+          llvm_unreachable("unexpected intrinsic");
+        }
+
+      finalizeLowering(Inst,
+                       Result.addNumComputeOps(getNumOps(Result.getVectorTy()) *
+                                               Result.getNumVectors()),
+                       Builder);
+      return true;
+    }
+    default:
+      llvm_unreachable("unexpected intrinsic");
+    }
+  }
+
   /// Helper to linearize a matrix expression tree into a string. Currently
   /// matrix expressions are linarized by starting at an expression leaf and
   /// linearizing bottom up.
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/binop.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/binop.ll
index 9160ced2715aa..48974a951c3bb 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/binop.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/binop.ll
@@ -432,3 +432,46 @@ define void @xor_2x2(ptr %lhs, ptr %rhs, ptr %out) {
   store <4 x i32> %optt, ptr %out
   ret void
 }
+
+define void @fabs_2x2f64(ptr %in, ptr %out) {
+; CHECK-LABEL: @fabs_2x2f64(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x double>, ptr [[IN:%.*]], align 32
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[COL_LOAD]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[COL_LOAD1]])
+; CHECK-NEXT:    store <2 x double> [[TMP1]], ptr [[OUT:%.*]], align 32
+; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr double, ptr [[OUT]], i64 2
+; CHECK-NEXT:    store <2 x double> [[TMP2]], ptr [[VEC_GEP2]], align 16
+; CHECK-NEXT:    ret void
+;
+  %load = load <4 x double>, ptr %in
+  %fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %load)
+  %fabst  = call <4 x double> @llvm.matrix.transpose(<4 x double> %fabs, i32 2, i32 2)
+  %fabstt = call <4 x double> @llvm.matrix.transpose(<4 x double> %fabst, i32 2, i32 2)
+  store <4 x double> %fabstt, ptr %out
+  ret void
+}
+
+define void @fabs_2x2i32(ptr %in, ptr %out) {
+; CHECK-LABEL: @fabs_2x2i32(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x i32>, ptr [[IN:%.*]], align 16
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr i32, ptr [[IN]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x i32>, ptr [[VEC_GEP]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i32> @llvm.abs.v2i32(<2 x i32> [[COL_LOAD]], i1 false)
+; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x i32> @llvm.abs.v2i32(<2 x i32> [[COL_LOAD1]], i1 false)
+; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x i32> @llvm.abs.v2i32(<2 x i32> [[TMP1]], i1 true)
+; CHECK-NEXT:    [[TMP4:%.*]] = call <2 x i32> @llvm.abs.v2i32(<2 x i32> [[TMP2]], i1 true)
+; CHECK-NEXT:    store <2 x i32> [[TMP3]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr i32, ptr [[OUT]], i64 2
+; CHECK-NEXT:    store <2 x i32> [[TMP4]], ptr [[VEC_GEP2]], align 8
+; CHECK-NEXT:    ret void
+;
+  %load = load <4 x i32>, ptr %in
+  %abs = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %load, i1 false)
+  %abst  = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %abs, i32 2, i32 2)
+  %abstt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %abst, i32 2, i32 2)
+  %absabstt = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %abstt, i1 true)
+  store <4 x i32> %absabstt, ptr %out
+  ret void
+}
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/unary.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/unary.ll
new file mode 100644
index 0000000000000..b73c1402044bc
--- /dev/null
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/unary.ll
@@ -0,0 +1,202 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes='lower-matrix-intrinsics' -S < %s | FileCheck %s
+
+define void @fneg_2x2(ptr %in, ptr %out) {
+; CHECK-LABEL: @fneg_2x2(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x float>, ptr [[IN:%.*]], align 16
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr float, ptr [[IN]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x float>, ptr [[VEC_GEP]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = fneg <2 x float> [[COL_LOAD]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fneg <2 x float> [[COL_LOAD1]]
+; CHECK-NEXT:    store <2 x float> [[TMP1]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr float, ptr [[OUT]], i64 2
+; CHECK-NEXT:    store <2 x float> [[TMP2]], ptr [[VEC_GEP2]], align 8
+; CHECK-NEXT:    ret void
+;
+  %inv = load <4 x float>, ptr %in
+  %op = fneg <4 x float> %inv
+  %opt  = call <4 x float> @llvm.matrix.transpose(<4 x float> %op, i32 2, i32 2)
+  %optt = call <4 x float> @llvm.matrix.transpose(<4 x float> %opt, i32 2, i32 2)
+  store <4 x float> %optt, ptr %out
+  ret void
+}
+
+define void @trunc_2x2(ptr %in, ptr %out) {
+; CHECK-LABEL: @trunc_2x2(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x i64>, ptr [[IN:%.*]], align 32
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr i64, ptr [[IN]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x i64>, ptr [[VEC_GEP]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc <2 x i64> [[COL_LOAD]] to <2 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc <2 x i64> [[COL_LOAD1]] to <2 x i32>
+; CHECK-NEXT:    store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr i32, ptr [[OUT]], i64 2
+; CHECK-NEXT:    store <2 x i32> [[TMP2]], ptr [[VEC_GEP2]], align 8
+; CHECK-NEXT:    ret void
+;
+  %inv = load <4 x i64>, ptr %in
+  %op = trunc <4 x i64> %inv to <4 x i32>
+  %opt  = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %op, i32 2, i32 2)
+  %optt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %opt, i32 2, i32 2)
+  store <4 x i32> %optt, ptr %out
+  ret void
+}
+
+define void @zext_2x2(ptr %in, ptr %out) {
+; CHECK-LABEL: @zext_2x2(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x i16>, ptr [[IN:%.*]], align 8
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr i16, ptr [[IN]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x i16>, ptr [[VEC_GEP]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = zext <2 x i16> [[COL_LOAD]] to <2 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <2 x i16> [[COL_LOAD1]] to <2 x i32>
+; CHECK-NEXT:    store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr i32, ptr [[OUT]], i64 2
+; CHECK-NEXT:    store <2 x i32> [[TMP2]], ptr [[VEC_GEP2]], align 8
+; CHECK-NEXT:    ret void
+;
+  %inv = load <4 x i16>, ptr %in
+  %op = zext <4 x i16> %inv to <4 x i32>
+  %opt  = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %op, i32 2, i32 2)
+  %optt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %opt, i32 2, i32 2)
+  store <4 x i32> %optt, ptr %out
+  ret void
+}
+
+define void @sext_2x2(ptr %in, ptr %out) {
+; CHECK-LABEL: @sext_2x2(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x i8>, ptr [[IN:%.*]], align 4
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr i8, ptr [[IN]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x i8>, ptr [[VEC_GEP]], align 2
+; CHECK-NEXT:    [[TMP1:%.*]] = sext <2 x i8> [[COL_LOAD]] to <2 x i16>
+; CHECK-NEXT:    [[TMP2:%.*]] = sext <2 x i8> [[COL_LOAD1]] to <2 x i16>
+; CHECK-NEXT:    store <2 x i16> [[TMP1]], ptr [[OUT:%.*]], align 8
+; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr i16, ptr [[OUT]], i64 2
+; CHECK-NEXT:    store <2 x i16> [[TMP2]], ptr [[VEC_GEP2]], align 4
+; CHECK-NEXT:    ret void
+;
+  %inv = load <4 x i8>, ptr %in
+  %op = sext <4 x i8> %inv to <4 x i16>
+  %opt  = call <4 x i16> @llvm.matrix.transpose(<4 x i16> %op, i32 2, i32 2)
+  %optt = call <4 x i16> @llvm.matrix.transpose(<4 x i16> %opt, i32 2, i32 2)
+  store <4 x i16> %optt, ptr %out
+  ret void
+}
+
+define void @fptoui_2x2(ptr %in, ptr %out) {
+; CHECK-LABEL: @fptoui_2x2(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x float>, ptr [[IN:%.*]], align 16
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr float, ptr [[IN]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x float>, ptr [[VEC_GEP]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = fptoui <2 x float> [[COL_LOAD]] to <2 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = fptoui <2 x float> [[COL_LOAD1]] to <2 x i32>
+; CHECK-NEXT:    store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr i32, ptr [[OUT]], i64 2
+; CHECK-NEXT:    store <2 x i32> [[TMP2]], ptr [[VEC_GEP2]], align 8
+; CHECK-NEXT:    ret void
+;
+  %inv = load <4 x float>, ptr %in
+  %op = fptoui <4 x float> %inv to <4 x i32>
+  %opt  = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %op, i32 2, i32 2)
+  %optt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %opt, i32 2, i32 2)
+  store <4 x i32> %optt, ptr %out
+  ret void
+}
+
+define void @fptosi_2x2(ptr %in, ptr %out) {
+; CHECK-LABEL: @fptosi_2x2(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x float>, ptr [[IN:%.*]], align 16
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr float, ptr [[IN]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x float>, ptr [[VEC_GEP]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = fptosi <2 x float> [[COL_LOAD]] to <2 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = fptosi <2 x float> [[COL_LOAD1]] to <2 x i32>
+; CHECK-NEXT:    store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr i32, ptr [[OUT]], i64 2
+; CHECK-NEXT:    store <2 x i32> [[TMP2]], ptr [[VEC_GEP2]], align 8
+; CHECK-NEXT:    ret void
+;
+  %inv = load <4 x float>, ptr %in
+  %op = fptosi <4 x float> %inv to <4 x i32>
+  %opt  = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %op, i32 2, i32 2)
+  %optt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %opt, i32 2, i32 2)
+  store <4 x i32> %optt, ptr %out
+  ret void
+}
+
+define void @uitofp_2x2(ptr %in, ptr %out) {
+; CHECK-LABEL: @uitofp_2x2(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x i64>, ptr [[IN:%.*]], align 32
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr i64, ptr [[IN]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x i64>, ptr [[VEC_GEP]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = uitofp <2 x i64> [[COL_LOAD]] to <2 x double>
+; CHECK-NEXT:    [[TMP2:%.*]] = uitofp <2 x i64> [[COL_LOAD1]] to <2 x double>
+; CHECK-NEXT:    store <2 x double> [[TMP1]], ptr [[OUT:%.*]], align 32
+; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr double, ptr [[OUT]], i64 2
+; CHECK-NEXT:    store <2 x double> [[TMP2]], ptr [[VEC_GEP2]], align 16
+; CHECK-NEXT:    ret void
+;
+  %inv = load <4 x i64>, ptr %in
+  %op = uitofp <4 x i64> %inv to <4 x double>
+  %opt  = call <4  x double> @llvm.matrix.transpose(<4  x double> %op, i32 2, i32 2)
+  %optt = call <4  x double> @llvm.matrix.transpose(<4  x double> %opt, i32 2, i32 2)
+  store <4  x double> %optt, ptr %out
+  ret void
+}
+
+define void @sitofp_2x2(ptr %in, ptr %out) {
+; CHECK-LABEL: @sitofp_2x2(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x i64>, ptr [[IN:%.*]], align 32
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr i64, ptr [[IN]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x i64>, ptr [[VEC_GEP]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <2 x i64> [[COL_LOAD]] to <2 x double>
+; CHECK-NEXT:    [[TMP2:%.*]] = sitofp <2 x i64> [[COL_LOAD1]] to <2 x double>
+; CHECK-NEXT:    store <2 x double> [[TMP1]], ptr [[OUT:%.*]], align 32
+; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr double, ptr [[OUT]], i64 2
+; CHECK-NEXT:    store <2 x double> [[TMP2]], ptr [[VEC_GEP2]], align 16
+; CHECK-NEXT:    ret void
+;
+  %inv = load <4 x i64>, ptr %in
+  %op = sitofp <4 x i64> %inv to <4 x double>
+  %opt  = call <4  x double> @llvm.matrix.transpose(<4  x double> %op, i32 2, i32 2)
+  %optt = call <4  x double> @llvm.matrix.transpose(<4  x double> %opt, i32 2, i32 2)
+  store <4  x double> %optt, ptr %out
+  ret void
+}
+
+define void @fptrunc_2x2(ptr %in, ptr %out) {
+; CHECK-LABEL: @fptrunc_2x2(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x double>, ptr [[IN:%.*]], align 32
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = fptrunc nnan <2 x double> [[COL_LOAD]] to <2 x float>
+; CHECK-NEXT:    [[TMP2:%.*]] = fptrunc nnan <2 x double> [[COL_LOAD1]] to <2 x float>
+; CHECK-NEXT:    store <2 x float> [[TMP1]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr float, ptr [[OUT]], i64 2
+; CHECK-NEXT:    store <2 x float> [[TMP2]], ptr [[VEC_GEP2]], align 8
+; CHECK-NEXT:    ret void
+;
+  %inv = load <4 x double>, ptr %in
+  %op = fptrunc nnan <4 x double> %inv to <4 x float>
+  %opt  = call <4 x float> @llvm.matrix.transpose(<4 x float> %op, i32 2, i32 2)
+  %optt = call <4 x float> @llvm.matrix.transpose(<4 x float> %opt, i32 2, i32 2)
+  store <4 x float> %optt, ptr %out
+  ret void
+}
+
+define void @fpext_2x2(ptr %in, ptr %out) {
+; CHECK-LABEL: @fpext_2x2(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x float>, ptr [[IN:%.*]], align 16
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr float, ptr [[IN]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x float>, ptr [[VEC_GEP]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = fpext <2 x float> [[COL_LOAD]] to <2 x double>
+; CHECK-NEXT:    [[TMP2:%.*]] = fpext <2 x float> [[COL_LOAD1]] to <2 x double>
+; CHECK-NEXT:    store <2 x double> [[TMP1]], ptr [[OUT:%.*]], align 32
+; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr double, ptr [[OUT]], i64 2
+; CHECK-NEXT:    store <2 x double> [[TMP2]], ptr [[VEC_GEP2]], align 16
+; CHECK-NEXT:    ret void
+;
+  %inv = load <4 x float>, ptr %in
+  %op = fpext <4 x float> %inv to <4 x double>
+  %opt  = call <4 x double> @llvm.matrix.transpose(<4 x double> %op, i32 2, i32 2)
+  %optt = call <4 x double> @llvm.matrix.transpose(<4 x double> %opt, i32 2, i32 2)
+  store <4 x double> %optt, ptr %out
+  ret void
+}

jroelofs · 2025-05-28T22:36:17Z

~~This is a stacked PR. Its counterpart should be merged first: #141704~~

edit: I've un-stacked them.

llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp

github-actions · 2025-05-29T00:52:02Z

✅ With the latest revision this PR passed the C/C++ code formatter.

llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp

llvm/test/Transforms/LowerMatrixIntrinsics/unary.ll

jroelofs requested review from fhahn and anemet May 28, 2025 22:35

llvmbot added the llvm:transforms label May 28, 2025

jroelofs force-pushed the jroelofs/lower-matrix-fpext branch 2 times, most recently from 3296ce5 to 42a624a Compare May 29, 2025 00:48

jroelofs commented May 29, 2025

View reviewed changes

llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp Outdated Show resolved Hide resolved

jroelofs force-pushed the jroelofs/lower-matrix-fpext branch from 42a624a to c3fddfe Compare May 29, 2025 01:41

jroelofs changed the title ~~[Matrix] Propagate shape information through cast instructions~~ [Matrix] Propagate shape information through cast insts May 29, 2025

fhahn reviewed May 29, 2025

View reviewed changes

llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp Outdated Show resolved Hide resolved

llvm/test/Transforms/LowerMatrixIntrinsics/unary.ll Outdated Show resolved Hide resolved

[Matrix] Propagate shape information through cast instructions

3779c5b

jroelofs force-pushed the jroelofs/lower-matrix-fpext branch from c3fddfe to 3779c5b Compare May 29, 2025 15:05

jroelofs added 4 commits May 29, 2025 08:14

use llvm.column.major.store for shape info

f2f146f

ensure switch is fully covered

5143403

add a test for bitcast <4 x double> %inv to <8 x i32>

278dbd1

and a test for bitcast <4 x double> %inv to i256

2d10a4f

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[Matrix] Propagate shape information through cast insts #141869

[Matrix] Propagate shape information through cast insts #141869

jroelofs commented May 28, 2025

Uh oh!

llvmbot commented May 28, 2025

Uh oh!

jroelofs commented May 28, 2025 •

edited

Loading

Uh oh!

Uh oh!

github-actions bot commented May 29, 2025 •

edited

Loading

Uh oh!

Uh oh!

Uh oh!

Uh oh!

[Matrix] Propagate shape information through cast insts #141869

Are you sure you want to change the base?

[Matrix] Propagate shape information through cast insts #141869

Conversation

jroelofs commented May 28, 2025

Uh oh!

llvmbot commented May 28, 2025

Uh oh!

jroelofs commented May 28, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

Uh oh!

github-actions bot commented May 29, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

jroelofs commented May 28, 2025 •

edited

Loading

github-actions bot commented May 29, 2025 •

edited

Loading