Skip to content

simd inline failed when using both _bzhi_u32 and _mm512_maskz_loadu_epi32 #121960

Closed
@silver-ymz

Description

@silver-ymz

I tried this code:

#![feature(stdarch_x86_avx512)]
#![feature(avx512_target_feature)]

use std::arch::x86_64::*;

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
pub unsafe fn test1(ptr: *const i32, len: u32) {
    let mask = _bzhi_u32(0xFF, len) as u16;
    std::hint::black_box(_mm512_maskz_loadu_epi32(mask, ptr));
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "bmi2")]
#[target_feature(enable = "avx512f")]
pub unsafe fn test2(ptr: *const i32, len: u32) {
    let mask = _bzhi_u32(0xFF, len) as u16;
    std::hint::black_box(_mm512_maskz_loadu_epi32(mask, ptr));
}

I expected to see this happen: Both bzhi and vmovdqu32 can be inlined.

Instead, this happened:

cargo asm
❯ cargo asm 2
    Finished `release` profile [optimized] target(s) in 0.00s

.section .text.test2::test1,"ax",@progbits
	.globl	test2::test1
	.p2align	4, 0x90
	.type	test2::test1,@function
test2::test1:
	.cfi_startproc
	push rbp
	.cfi_def_cfa_offset 16
	.cfi_offset rbp, -16
	mov rbp, rsp
	.cfi_def_cfa_register rbp
	push rbx
	and rsp, -64
	sub rsp, 128
	.cfi_offset rbx, -24
	mov rbx, rdi
	mov edi, esi
	call core::core_arch::x86::bmi2::_bzhi_u32
	kmovw k1, eax
	#APP

	vmovdqu32 zmm0 {k1} {z}, zmmword ptr [rbx]

	#NO_APP
	vmovaps zmmword ptr [rsp], zmm0
	mov rax, rsp
	#APP
	#NO_APP
	lea rsp, [rbp - 8]
	pop rbx
	pop rbp
	.cfi_def_cfa rsp, 8
	vzeroupper
	ret

❯ cargo asm 3
    Finished `release` profile [optimized] target(s) in 0.00s

.section .text.test2::test2,"ax",@progbits
	.globl	test2::test2
	.p2align	4, 0x90
	.type	test2::test2,@function
test2::test2:
	.cfi_startproc
	push rbp
	.cfi_def_cfa_offset 16
	.cfi_offset rbp, -16
	mov rbp, rsp
	.cfi_def_cfa_register rbp
	push rbx
	and rsp, -64
	sub rsp, 128
	.cfi_offset rbx, -24
	mov rdx, rdi
	mov eax, 255
	bzhi esi, eax, esi
	mov rbx, rsp
	mov rdi, rbx
	call core::core_arch::x86::avx512f::_mm512_maskz_loadu_epi32
	#APP
	#NO_APP
	lea rsp, [rbp - 8]
	pop rbx
	pop rbp
	.cfi_def_cfa rsp, 8
	ret

When using RUSTFLAGS="-C target-cpu=native", it works well.
Also, when I replace _mm512_maskz_loadu_epi32 with _mm512_loadu_epi32, it works well.

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "bmi2")]
#[target_feature(enable = "avx512f")]
pub unsafe fn test3(ptr: *const i32, len: u32) {
    let mask = _bzhi_u32(0xFFFF, len) as u16;
    std::hint::black_box(mask);
    std::hint::black_box(_mm512_loadu_epi32(ptr));
}
cargo asm
❯ cargo asm 4    
    Finished `release` profile [optimized] target(s) in 0.00s

.section .text.test2::test3,"ax",@progbits
	.globl	test2::test3
	.p2align	4, 0x90
	.type	test2::test3,@function
test2::test3:
	.cfi_startproc
	push rbp
	.cfi_def_cfa_offset 16
	.cfi_offset rbp, -16
	mov rbp, rsp
	.cfi_def_cfa_register rbp
	and rsp, -64
	sub rsp, 128
	mov eax, 65535
	bzhi eax, eax, esi
	mov word ptr [rsp], ax
	mov rax, rsp
	#APP
	#NO_APP
	vmovups zmm0, zmmword ptr [rdi]
	vmovaps zmmword ptr [rsp], zmm0
	mov rax, rsp
	#APP
	#NO_APP
	mov rsp, rbp
	pop rbp
	.cfi_def_cfa rsp, 8
	vzeroupper
	ret

Meta

rustc --version --verbose:

rustc 1.78.0-nightly (3246e7951 2024-02-19)
binary: rustc
commit-hash: 3246e79513cb89ddbfc0f21cb5a877e5b321dcc5
commit-date: 2024-02-19
host: x86_64-unknown-linux-gnu
release: 1.78.0-nightly
LLVM version: 18.1.0

Metadata

Metadata

Assignees

Labels

A-LLVMArea: Code generation parts specific to LLVM. Both correctness bugs and optimization-related issues.A-codegenArea: Code generationC-bugCategory: This is a bug.I-slowIssue: Problems and improvements with respect to performance of generated code.O-x86_64Target: x86-64 processors (like x86_64-*) (also known as amd64 and x64)llvm-fixed-upstreamIssue expected to be fixed by the next major LLVM upgrade, or backported fixes

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions