diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 52cb1dbb33b86..1356ff9e56adb 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -128,6 +128,7 @@ class VectorCombine {
   bool foldShuffleOfShuffles(Instruction &I);
   bool foldShuffleOfIntrinsics(Instruction &I);
   bool foldShuffleToIdentity(Instruction &I);
+  bool foldShuffleExt(Instruction &I);
   bool foldShuffleFromReductions(Instruction &I);
   bool foldCastFromReductions(Instruction &I);
   bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
@@ -2791,6 +2792,68 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
   return true;
 }
 
+/// Try to fold a vector zero-/sign-extend that is split across two extends
+/// separated by a sub-vector extract into a single, wider extend:
+///   OuterExt (shufflevector (InnerExt %src), _, identity-extract-mask)
+///     --> Ext (shufflevector %src, poison, identity-extract-mask)
+/// \p I is the shufflevector in the middle of the chain. Returns true if the
+/// IR was changed.
+bool VectorCombine::foldShuffleExt(Instruction &I) {
+  // Match a single-use shuffle whose first operand is a single-use zext/sext
+  // and whose mask extracts the leading sub-vector of that operand.
+  Value *Src;
+  ArrayRef<int> Mask;
+  if (!match(&I, m_OneUse(m_Shuffle(m_OneUse(m_ZExtOrSExt(m_Value(Src))),
+                                    m_Value(), m_Mask(Mask)))) ||
+      !cast<ShuffleVectorInst>(&I)->isIdentityWithExtract())
+    return false;
+  auto *InnerExt = cast<Instruction>(I.getOperand(0));
+  auto *OuterExt = cast<Instruction>(*I.user_begin());
+  if (!isa<ZExtInst, SExtInst>(OuterExt))
+    return false;
+
+  // If the inner extend is a sign extend and the outer one isn't (i.e. a
+  // zero-extend), don't fold. If the first one is zero-extend, it doesn't
+  // matter if the second one is a sign- or zero-extend.
+  if (isa<SExtInst>(InnerExt) && !isa<SExtInst>(OuterExt))
+    return false;
+
+  auto *DstTy = cast<FixedVectorType>(OuterExt->getType());
+  // Type of the narrowed shuffle: source element type, result element count.
+  auto *SrcTy =
+      FixedVectorType::get(InnerExt->getOperand(0)->getType()->getScalarType(),
+                           DstTy->getNumElements());
+
+  // Don't perform the fold if the new extend is more expensive than the two
+  // original extends. getCastInstrCost takes (Opcode, Dst, Src); each original
+  // extend is costed with its own opcode and types.
+  // NOTE(review): the shuffles on either side are not costed here.
+  InstructionCost OriginalCost =
+      TTI.getCastInstrCost(InnerExt->getOpcode(), InnerExt->getType(),
+                           InnerExt->getOperand(0)->getType(),
+                           TTI::CastContextHint::None) +
+      TTI.getCastInstrCost(OuterExt->getOpcode(), DstTy, I.getType(),
+                           TTI::CastContextHint::None);
+  InstructionCost NewCost = TTI.getCastInstrCost(
+      InnerExt->getOpcode(), DstTy, SrcTy, TTI::CastContextHint::None);
+  if (NewCost > OriginalCost)
+    return false;
+
+  // Convert to a shuffle of the input feeding a single wide extend. The new
+  // extend uses the inner extend's kind, and for a zext its nneg flag.
+  Builder.SetInsertPoint(*OuterExt->getInsertionPointAfterDef());
+  auto *NewIns =
+      Builder.CreateShuffleVector(Src, PoisonValue::get(Src->getType()), Mask);
+  auto *NewExt =
+      isa<ZExtInst>(InnerExt)
+          ? Builder.CreateZExt(NewIns, DstTy, "vec.ext", InnerExt->hasNonNeg())
+          : Builder.CreateSExt(NewIns, DstTy, "vec.ext");
+  OuterExt->replaceAllUsesWith(NewExt);
+  replaceValue(*OuterExt, *NewExt);
+  Worklist.pushValue(NewExt);
+  return true;
+}
+
 /// Given a commutative reduction, the order of the input lanes does not alter
 /// the results. We can use this to remove certain shuffles feeding the
 /// reduction, removing the need to shuffle at all.
@@ -3565,6 +3628,7 @@ bool VectorCombine::run() {
         break;
       case Instruction::ShuffleVector:
         MadeChange |= widenSubvectorLoad(I);
+        MadeChange |= foldShuffleExt(I);
         break;
       default:
         break;
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/combine-shuffle-ext.ll b/llvm/test/Transforms/VectorCombine/AArch64/combine-shuffle-ext.ll
index 6341c8945247d..9ac3655f3e59d 100644
--- a/llvm/test/Transforms/VectorCombine/AArch64/combine-shuffle-ext.ll
+++ b/llvm/test/Transforms/VectorCombine/AArch64/combine-shuffle-ext.ll
@@ -14,9 +14,8 @@ define <4 x i32> @load_i32_zext_to_v4i32(ptr %di) {
 ; CHECK-NEXT:    [[L:%.*]] = load i32, ptr [[DI]], align 4
 ; CHECK-NEXT:    [[VEC_INS:%.*]] = insertelement <2 x i32> , i32 [[L]], i64 0
 ; CHECK-NEXT:    [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8>
-; CHECK-NEXT:    [[E_1:%.*]] = zext <8 x i8> [[VEC_BC]] to <8 x i16>
-; CHECK-NEXT:    [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32>
-; CHECK-NEXT:    [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32>
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32>
+; CHECK-NEXT:    [[EXT_2:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i32>
 ; CHECK-NEXT:    ret <4 x i32> [[EXT_2]]
 ;
 entry:
@@ -36,9 +35,8 @@ define <4 x i32> @load_i32_zext_to_v4i32_both_nneg(ptr %di) {
 ; CHECK-NEXT:    [[L:%.*]] = load i32, ptr [[DI]], align 4
 ; CHECK-NEXT:    [[VEC_INS:%.*]] = insertelement <2 x i32> , i32 [[L]], i64 0
 ; CHECK-NEXT:    [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8>
-; CHECK-NEXT:    [[E_1:%.*]] = zext nneg <8 x i8> [[VEC_BC]] to <8 x i16>
-; CHECK-NEXT:    [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32>
-; CHECK-NEXT:    [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32>
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32>
+; CHECK-NEXT:    [[EXT_2:%.*]] = zext nneg <4 x i8> [[TMP0]] to <4 x i32>
 ; CHECK-NEXT:    ret <4 x i32> [[EXT_2]]
 ;
 entry:
@@ -58,9
+56,8 @@ define <4 x i32> @load_i32_zext_to_v4i32_inner_nneg(ptr %di) { ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4 ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> , i32 [[L]], i64 0 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8> -; CHECK-NEXT: [[E_1:%.*]] = zext nneg <8 x i8> [[VEC_BC]] to <8 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[EXT_2:%.*]] = zext <4 x i16> [[VEC_SHUFFLE]] to <4 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i8> [[TMP0]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[EXT_2]] ; entry: @@ -80,9 +77,8 @@ define <4 x i32> @load_i32_zext_to_v4i32_outer_nneg(ptr %di) { ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4 ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> , i32 [[L]], i64 0 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8> -; CHECK-NEXT: [[E_1:%.*]] = zext <8 x i8> [[VEC_BC]] to <8 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[EXT_2:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[EXT_2]] ; entry: @@ -102,9 +98,8 @@ define <4 x i32> @load_i32_zext_to_v4i32_inner_nneg_outer_sext(ptr %di) { ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4 ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> , i32 [[L]], i64 0 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8> -; CHECK-NEXT: [[E_1:%.*]] = zext nneg <8 x i8> [[VEC_BC]] to <8 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[EXT_2:%.*]] = sext <4 x i16> [[VEC_SHUFFLE]] to 
<4 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i8> [[TMP0]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[EXT_2]] ; entry: @@ -125,9 +120,8 @@ define <4 x i32> @load_i32_zext_to_v4i32_clobber_after_load(ptr %di) { ; CHECK-NEXT: call void @use.i32(i32 0) ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> , i32 [[L]], i64 0 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8> -; CHECK-NEXT: [[E_1:%.*]] = zext <8 x i8> [[VEC_BC]] to <8 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[EXT_2:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[EXT_2]] ; entry: @@ -170,9 +164,8 @@ define <4 x i32> @load_i32_zext_to_v4i32_load_other_users(ptr %di) { ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4 ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> , i32 [[L]], i64 0 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8> -; CHECK-NEXT: [[E_1:%.*]] = zext <8 x i8> [[VEC_BC]] to <8 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[EXT_2:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i32> ; CHECK-NEXT: call void @use.i32(i32 [[L]]) ; CHECK-NEXT: ret <4 x i32> [[EXT_2]] ; @@ -194,9 +187,8 @@ define <4 x i32> @load_i32_zext_to_v4i32_ins_other_users(ptr %di) { ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4 ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> , i32 [[L]], i64 0 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> 
[[VEC_INS]] to <8 x i8> -; CHECK-NEXT: [[E_1:%.*]] = zext <8 x i8> [[VEC_BC]] to <8 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[EXT_2:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i32> ; CHECK-NEXT: call void @use.v2i32(<2 x i32> [[VEC_INS]]) ; CHECK-NEXT: ret <4 x i32> [[EXT_2]] ; @@ -218,9 +210,8 @@ define <4 x i32> @load_i32_zext_to_v4i32_bc_other_users(ptr %di) { ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4 ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> , i32 [[L]], i64 0 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8> -; CHECK-NEXT: [[E_1:%.*]] = zext <8 x i8> [[VEC_BC]] to <8 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[EXT_2:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i32> ; CHECK-NEXT: call void @use.v8i8(<8 x i8> [[VEC_BC]]) ; CHECK-NEXT: ret <4 x i32> [[EXT_2]] ; @@ -290,9 +281,8 @@ define <8 x i32> @load_i64_zext_to_v8i32(ptr %di) { ; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[DI]], align 8 ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i64> , i64 [[L]], i64 0 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i64> [[VEC_INS]] to <16 x i8> -; CHECK-NEXT: [[EXT_1:%.*]] = zext <16 x i8> [[VEC_BC]] to <16 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <16 x i16> [[EXT_1]], <16 x i16> poison, <8 x i32> -; CHECK-NEXT: [[OUTER_EXT:%.*]] = zext nneg <8 x i16> [[VEC_SHUFFLE]] to <8 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[VEC_BC]], <16 x i8> poison, <8 x i32> +; CHECK-NEXT: [[OUTER_EXT:%.*]] = zext <8 x i8> 
[[TMP0]] to <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[OUTER_EXT]] ; entry: @@ -312,9 +302,8 @@ define <3 x i32> @load_i24_zext_to_v3i32(ptr %di) { ; CHECK-NEXT: [[L:%.*]] = load i24, ptr [[DI]], align 4 ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i24> , i24 [[L]], i64 0 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i24> [[VEC_INS]] to <6 x i8> -; CHECK-NEXT: [[EXT_1:%.*]] = zext <6 x i8> [[VEC_BC]] to <6 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <6 x i16> [[EXT_1]], <6 x i16> poison, <3 x i32> -; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <3 x i16> [[VEC_SHUFFLE]] to <3 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <6 x i8> [[VEC_BC]], <6 x i8> poison, <3 x i32> +; CHECK-NEXT: [[EXT_2:%.*]] = zext <3 x i8> [[TMP0]] to <3 x i32> ; CHECK-NEXT: ret <3 x i32> [[EXT_2]] ; entry: @@ -334,9 +323,8 @@ define <4 x i32> @load_i32_insert_idx_1_sext(ptr %di) { ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4 ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> , i32 [[L]], i64 1 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8> -; CHECK-NEXT: [[EXT_1:%.*]] = zext <8 x i8> [[VEC_BC]] to <8 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[EXT_1]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[EXT_2:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[EXT_2]] ; entry: @@ -422,9 +410,8 @@ define <4 x i32> @load_i32_sext_to_v4i32(ptr %di) { ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4 ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> , i32 [[L]], i64 0 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8> -; CHECK-NEXT: [[E_1:%.*]] = sext <8 x i8> [[VEC_BC]] to <8 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: 
[[EXT_2:%.*]] = sext <4 x i16> [[VEC_SHUFFLE]] to <4 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[EXT_2:%.*]] = sext <4 x i8> [[TMP0]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[EXT_2]] ; entry: @@ -444,9 +431,8 @@ define <8 x i32> @load_i64_sext_to_v8i32(ptr %di) { ; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[DI]], align 8 ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i64> , i64 [[L]], i64 0 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i64> [[VEC_INS]] to <16 x i8> -; CHECK-NEXT: [[EXT_1:%.*]] = sext <16 x i8> [[VEC_BC]] to <16 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <16 x i16> [[EXT_1]], <16 x i16> poison, <8 x i32> -; CHECK-NEXT: [[OUTER_EXT:%.*]] = sext <8 x i16> [[VEC_SHUFFLE]] to <8 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[VEC_BC]], <16 x i8> poison, <8 x i32> +; CHECK-NEXT: [[OUTER_EXT:%.*]] = sext <8 x i8> [[TMP0]] to <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[OUTER_EXT]] ; entry: @@ -466,9 +452,8 @@ define <3 x i32> @load_i24_sext_to_v3i32(ptr %di) { ; CHECK-NEXT: [[L:%.*]] = load i24, ptr [[DI]], align 4 ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i24> , i24 [[L]], i64 0 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i24> [[VEC_INS]] to <6 x i8> -; CHECK-NEXT: [[EXT_1:%.*]] = sext <6 x i8> [[VEC_BC]] to <6 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <6 x i16> [[EXT_1]], <6 x i16> poison, <3 x i32> -; CHECK-NEXT: [[EXT_2:%.*]] = sext <3 x i16> [[VEC_SHUFFLE]] to <3 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <6 x i8> [[VEC_BC]], <6 x i8> poison, <3 x i32> +; CHECK-NEXT: [[EXT_2:%.*]] = sext <3 x i8> [[TMP0]] to <3 x i32> ; CHECK-NEXT: ret <3 x i32> [[EXT_2]] ; entry: @@ -488,9 +473,8 @@ define <4 x i32> @load_i32_insert_idx_1(ptr %di) { ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4 ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> , i32 [[L]], i64 1 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] 
to <8 x i8> -; CHECK-NEXT: [[EXT_1:%.*]] = sext <8 x i8> [[VEC_BC]] to <8 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[EXT_1]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[EXT_2:%.*]] = sext <4 x i16> [[VEC_SHUFFLE]] to <4 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[EXT_2:%.*]] = sext <4 x i8> [[TMP0]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[EXT_2]] ; entry: