Closed
Description
I tried this code:
#![feature(stdarch_x86_avx512)]
#![feature(avx512_target_feature)]
use std::arch::x86_64::*;
/// Reproducer: uses a BMI2 intrinsic and an AVX-512F masked load while enabling only `avx512f`.
///
/// `bmi2` is not listed in this function's target features. In the `cargo asm` listing for this
/// function, `_bzhi_u32` is emitted as an out-of-line call while the masked load is emitted
/// inline as `vmovdqu32`.
///
/// # Safety
///
/// NOTE(review): presumably the caller must run on a CPU supporting AVX-512F and BMI2 and pass
/// a `ptr` that is valid for reading the lanes selected by the mask — confirm.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
pub unsafe fn test1(ptr: *const i32, len: u32) {
// BZHI: clear the bits of 0xFF at positions `len` and above, then narrow to the 16-bit lane mask.
// NOTE(review): 0xFF can set at most 8 of the 16 mask bits, whereas `test3` starts from 0xFFFF —
// confirm this is intended.
let mask = _bzhi_u32(0xFF, len) as u16;
// `black_box` consumes the loaded vector so the load is not optimized away.
std::hint::black_box(_mm512_maskz_loadu_epi32(mask, ptr));
}
/// Reproducer: same body as `test1`, but with both `bmi2` and `avx512f` enabled.
///
/// In the `cargo asm` listing for this function, `bzhi` is emitted inline, but
/// `_mm512_maskz_loadu_epi32` is emitted as an out-of-line call even though `avx512f` is enabled.
///
/// # Safety
///
/// NOTE(review): presumably the caller must run on a CPU supporting BMI2 and AVX-512F and pass
/// a `ptr` that is valid for reading the lanes selected by the mask — confirm.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "bmi2")]
#[target_feature(enable = "avx512f")]
pub unsafe fn test2(ptr: *const i32, len: u32) {
// BZHI: clear the bits of 0xFF at positions `len` and above, then narrow to the 16-bit lane mask.
// NOTE(review): 0xFF can set at most 8 of the 16 mask bits, whereas `test3` starts from 0xFFFF —
// confirm this is intended.
let mask = _bzhi_u32(0xFF, len) as u16;
// `black_box` consumes the loaded vector so the load is not optimized away.
std::hint::black_box(_mm512_maskz_loadu_epi32(mask, ptr));
}
I expected to see this happen: Both `bzhi` and `vmovdqu32` can be inlined.
Instead, this happened:
cargo asm
❯ cargo asm 2
Finished `release` profile [optimized] target(s) in 0.00s
.section .text.test2::test1,"ax",@progbits
.globl test2::test1
.p2align 4, 0x90
.type test2::test1,@function
test2::test1:
.cfi_startproc
push rbp
.cfi_def_cfa_offset 16
.cfi_offset rbp, -16
mov rbp, rsp
.cfi_def_cfa_register rbp
push rbx
and rsp, -64
sub rsp, 128
.cfi_offset rbx, -24
mov rbx, rdi
mov edi, esi
call core::core_arch::x86::bmi2::_bzhi_u32
kmovw k1, eax
#APP
vmovdqu32 zmm0 {k1} {z}, zmmword ptr [rbx]
#NO_APP
vmovaps zmmword ptr [rsp], zmm0
mov rax, rsp
#APP
#NO_APP
lea rsp, [rbp - 8]
pop rbx
pop rbp
.cfi_def_cfa rsp, 8
vzeroupper
ret
❯ cargo asm 3
Finished `release` profile [optimized] target(s) in 0.00s
.section .text.test2::test2,"ax",@progbits
.globl test2::test2
.p2align 4, 0x90
.type test2::test2,@function
test2::test2:
.cfi_startproc
push rbp
.cfi_def_cfa_offset 16
.cfi_offset rbp, -16
mov rbp, rsp
.cfi_def_cfa_register rbp
push rbx
and rsp, -64
sub rsp, 128
.cfi_offset rbx, -24
mov rdx, rdi
mov eax, 255
bzhi esi, eax, esi
mov rbx, rsp
mov rdi, rbx
call core::core_arch::x86::avx512f::_mm512_maskz_loadu_epi32
#APP
#NO_APP
lea rsp, [rbp - 8]
pop rbx
pop rbp
.cfi_def_cfa rsp, 8
ret
When using `RUSTFLAGS="-C target-cpu=native"`, it works well.
Also, when I replace `_mm512_maskz_loadu_epi32` with `_mm512_loadu_epi32`, it works well.
/// Control case: computes the mask with `_bzhi_u32` but performs an unmasked
/// `_mm512_loadu_epi32` instead of the masked load.
///
/// In the `cargo asm` listing for this function, both `bzhi` and the 512-bit load (`vmovups`)
/// are emitted inline.
///
/// # Safety
///
/// NOTE(review): presumably the caller must run on a CPU supporting BMI2 and AVX-512F and pass
/// a `ptr` that is valid for reading a full 512-bit vector (16 `i32` values) — confirm.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "bmi2")]
#[target_feature(enable = "avx512f")]
pub unsafe fn test3(ptr: *const i32, len: u32) {
// BZHI: clear the bits of 0xFFFF at positions `len` and above, then narrow to 16 bits.
let mask = _bzhi_u32(0xFFFF, len) as u16;
// The mask is not passed to the load here; `black_box` keeps its computation from being removed.
std::hint::black_box(mask);
// Unmasked load; `black_box` consumes the vector so the load is not optimized away.
std::hint::black_box(_mm512_loadu_epi32(ptr));
}
cargo asm
❯ cargo asm 4
Finished `release` profile [optimized] target(s) in 0.00s
.section .text.test2::test3,"ax",@progbits
.globl test2::test3
.p2align 4, 0x90
.type test2::test3,@function
test2::test3:
.cfi_startproc
push rbp
.cfi_def_cfa_offset 16
.cfi_offset rbp, -16
mov rbp, rsp
.cfi_def_cfa_register rbp
and rsp, -64
sub rsp, 128
mov eax, 65535
bzhi eax, eax, esi
mov word ptr [rsp], ax
mov rax, rsp
#APP
#NO_APP
vmovups zmm0, zmmword ptr [rdi]
vmovaps zmmword ptr [rsp], zmm0
mov rax, rsp
#APP
#NO_APP
mov rsp, rbp
pop rbp
.cfi_def_cfa rsp, 8
vzeroupper
ret
Meta
`rustc --version --verbose`:
rustc 1.78.0-nightly (3246e7951 2024-02-19)
binary: rustc
commit-hash: 3246e79513cb89ddbfc0f21cb5a877e5b321dcc5
commit-date: 2024-02-19
host: x86_64-unknown-linux-gnu
release: 1.78.0-nightly
LLVM version: 18.1.0
Metadata
Metadata
Assignees
Labels
- Area: Code generation parts specific to LLVM. Both correctness bugs and optimization-related issues.
- Area: Code generation
- Category: This is a bug.
- Issue: Problems and improvements with respect to performance of generated code.
- Target: x86-64 processors (like x86_64-*) (also known as amd64 and x64)
- Issue expected to be fixed by the next major LLVM upgrade, or backported fixes