
Special-case transmute for primitive, SIMD & pointer types. #19294


Merged: 1 commit into rust-lang:master on Dec 11, 2014

Conversation

@huonw (Member) commented Nov 25, 2014

This detects (a subset of) the cases when `transmute::<T, U>(x)` can be
lowered to a direct `bitcast T x to U` in LLVM. This assists with
efficiently handling a SIMD vector as multiple different types,
e.g. swapping bytes/words/double words around inside some larger vector
type.

C compilers like GCC and Clang handle integer vector types as `__m128i`
for all widths, and implicitly insert bitcasts as required. This patch
allows Rust to express this, even if it takes a bit of `unsafe`, whereas
previously it was impossible to do at all without inline assembly.

Example:

pub fn reverse_u32s(u: u64x2) -> u64x2 {
    unsafe {
        let tmp = mem::transmute::<_, u32x4>(u);
        let swapped = u32x4(tmp.3, tmp.2, tmp.1, tmp.0);
        mem::transmute::<_, u64x2>(swapped)
    }
}

Compiling with `--opt-level=3` gives:

Before

define <2 x i64> @_ZN12reverse_u32s20hbdb206aba18a03d8tbaE(<2 x i64>) unnamed_addr #0 {
entry-block:
  %1 = bitcast <2 x i64> %0 to i128
  %u.0.extract.trunc = trunc i128 %1 to i32
  %u.4.extract.shift = lshr i128 %1, 32
  %u.4.extract.trunc = trunc i128 %u.4.extract.shift to i32
  %u.8.extract.shift = lshr i128 %1, 64
  %u.8.extract.trunc = trunc i128 %u.8.extract.shift to i32
  %u.12.extract.shift = lshr i128 %1, 96
  %u.12.extract.trunc = trunc i128 %u.12.extract.shift to i32
  %2 = insertelement <4 x i32> undef, i32 %u.12.extract.trunc, i64 0
  %3 = insertelement <4 x i32> %2, i32 %u.8.extract.trunc, i64 1
  %4 = insertelement <4 x i32> %3, i32 %u.4.extract.trunc, i64 2
  %5 = insertelement <4 x i32> %4, i32 %u.0.extract.trunc, i64 3
  %6 = bitcast <4 x i32> %5 to <2 x i64>
  ret <2 x i64> %6
}

_ZN12reverse_u32s20hbdb206aba18a03d8tbaE:
    .cfi_startproc
    movd    %xmm0, %rax
    punpckhqdq  %xmm0, %xmm0
    movd    %xmm0, %rcx
    movq    %rcx, %rdx
    shrq    $32, %rdx
    movq    %rax, %rsi
    shrq    $32, %rsi
    movd    %eax, %xmm0
    movd    %ecx, %xmm1
    punpckldq   %xmm0, %xmm1
    movd    %esi, %xmm2
    movd    %edx, %xmm0
    punpckldq   %xmm2, %xmm0
    punpckldq   %xmm1, %xmm0
    retq

After

define <2 x i64> @_ZN12reverse_u32s20hbdb206aba18a03d8tbaE(<2 x i64>) unnamed_addr #0 {
entry-block:
  %1 = bitcast <2 x i64> %0 to <4 x i32>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  %3 = bitcast <4 x i32> %2 to <2 x i64>
  ret <2 x i64> %3
}

_ZN12reverse_u32s20hbdb206aba18a03d8tbaE:
    .cfi_startproc
    pshufd  $27, %xmm0, %xmm0
    retq

huonw added a commit to huonw/dsfmt-rs that referenced this pull request Nov 25, 2014
This has several advantages.

- it is faster in the microbenchmark (after a patch to rustc),
  presumably because the compiler has full control over data flow and
  data types, and so can arrange the loads optimally and avoid
  unnecessarily placing things onto the stack. It also means the
  compiler can see the SIMD vectors are always in the integer domain and
  so use (on x86) MOVDQA rather than the floating-point MOVAPS, avoiding
  the cross-domain penalties on platforms that have them.

- the library is now usable on x86 platforms without SSE2, and also on
  non-x86 platforms, possibly at a performance penalty (but maybe not,
  e.g. it may work with ARM's NEON), but this is better than not being
  usable at all.

- the danger and problems of `asm!` are completely removed, replaced
  with a pair of easy-to-verify `transmute`s.

Before:

    test bench_mt19937_1_000_000_rands ... bench:   1606892 ns/iter (+/- 57461)

After, with rust-lang/rust#19294:

    test bench_mt19937_1_000_000_rands ... bench:   1539038 ns/iter (+/- 33623)

Without that patch it takes `2787449 ns/iter`.

The last of the three (essentially identical) inner loops of the
benchmark is given below, before and after; it demonstrates the
advantages listed above.

Before:

    .LBB2_6:
    	movaps	3056(%rax,%rdx), %xmm2
    	movaps	1872(%rax,%rdx), %xmm3
    	movaps	%xmm2, (%rsp)
    	#APP
    	psllq	$19, %xmm2
    	pxor	%xmm3, %xmm2
    	pshufd	$27, %xmm1, %xmm1
    	pxor	%xmm2, %xmm1
    	movaps	%xmm1, %xmm2
    	movaps	%xmm1, %xmm3
    	pand	%xmm0, %xmm2
    	psrlq	$12, %xmm3
    	pxor	(%rsp), %xmm3
    	pxor	%xmm2, %xmm3
    	#NO_APP
    	movaps	%xmm3, 3056(%rax,%rdx)
    	addq	$16, %rdx
    	jne	.LBB2_6

After:

    .LBB2_4:
    	movdqa	3056(%rax,%rdx), %xmm2
    	pshufd	$27, %xmm1, %xmm3
    	movdqa	%xmm2, %xmm1
    	psllq	$19, %xmm1
    	pxor	1872(%rax,%rdx), %xmm1
    	pxor	%xmm3, %xmm1
    	movdqa	%xmm1, %xmm3
    	psrlq	$12, %xmm3
    	movdqa	%xmm1, %xmm4
    	pand	%xmm0, %xmm4
    	pxor	%xmm3, %xmm4
    	pxor	%xmm2, %xmm4
    	movdqa	%xmm4, 3056(%rax,%rdx)
    	addq	$16, %rdx
    	jne	.LBB2_4
// Doing this special case makes conversions like `u32x4` ->
// `u64x2` much nicer for LLVM and so more efficient (these
// are done efficiently implicitly in C with the `__m128i`
// type and so this means Rust doesn't loose out there).
Contributor:

"lose", not "loose"

Member Author:

Whoops, thanks.

@huonw (Member, Author) commented Nov 25, 2014

Grieverheart/dsfmt-rs#2 is a real-world example of something this helps.

@huonw force-pushed the transmute-inplace branch from a518b54 to 59f7f38 on November 25, 2014 10:23
// are done efficiently implicitly in C with the `__m128i`
// type and so this means Rust doesn't lose out there).
let DatumBlock { bcx: bcx2, datum } = expr::trans(bcx, &*arg_exprs[0]);
bcx = bcx2;
Member:

Equivalent to `let datum = unpack_datum!(bcx, expr::trans(bcx, &*arg_exprs[0]));`.

Member Author:

Fixed.

@huonw force-pushed the transmute-inplace branch from 59f7f38 to 9fc66d9 on November 25, 2014 12:41
@eddyb (Member) commented Nov 25, 2014

What does this do for `transmute::<(), ()>(())`? I know it's not a real-world use case, but `()` is currently classified as a scalar and its LLVM type is `{}`, which I suspect is not valid for a bitcast.

IMO the use of `type_is_scalar` is ad hoc and fragile; could we not use the LLVM type instead? I'm thinking integers, floating-point and vectors ought to be enough for now.

This way we could also (easily) handle pointer -> pointer and pointer -> integer / integer -> pointer casts, even when the Rust type doesn't readily provide this information (`trans::adt` could very well flatten an enum+newtype hierarchy many levels deep into a single pointer).
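
A minimal sketch of the user-facing casts under discussion (illustrative only: the function names are invented, the spelling is later Rust, and of these only the pointer -> pointer case ended up covered by this patch, per the update below):

    use std::mem;

    // Pointer -> pointer: both sides share a representation, so a
    // single LLVM `bitcast` suffices.
    unsafe fn as_byte_ptr(p: *const u32) -> *const u8 {
        mem::transmute::<*const u32, *const u8>(p)
    }

    // Pointer -> integer: `transmute`'s same-size requirement holds for
    // thin pointers, so this could in principle lower to a
    // ptrtoint-style conversion rather than a memory round-trip.
    unsafe fn as_addr(p: *const u32) -> usize {
        mem::transmute::<*const u32, usize>(p)
    }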

@nikomatsakis (Contributor):

I think in principle this makes sense. @eddyb is probably right that examining the LLVM types may be a good idea.

@huonw force-pushed the transmute-inplace branch from 9fc66d9 to 2c2041a on November 25, 2014 22:36
@huonw (Member, Author) commented Nov 25, 2014

Updated to look at the LLVM type. This also now includes detecting the pointer -> pointer case for `bitcast`.

@eddyb (Member) commented Nov 26, 2014

I expect this to work fine for types that implement `Drop` if `datum.to_llscalarish()` is replaced with something like `if datum.kind.is_by_ref() { load_ty(bcx, datum.val, datum.ty) } else { datum.val }`.

@huonw (Member, Author) commented Nov 27, 2014

Making this change:

--- a/src/librustc_trans/trans/intrinsic.rs
+++ b/src/librustc_trans/trans/intrinsic.rs
@@ -192,11 +192,8 @@ pub fn trans_intrinsic_call<'a, 'blk, 'tcx>(mut bcx: Block<'blk, 'tcx>,
                     (nonpointer_nonaggregate(in_kind) && nonpointer_nonaggregate(ret_kind)) || {
                         in_kind == TypeKind::Pointer && ret_kind == TypeKind::Pointer
                     };
-                let primitive =
-                    !ty::type_needs_drop(ccx.tcx(), in_type) &&
-                    !ty::type_needs_drop(ccx.tcx(), ret_ty.unwrap());

-                let dest = if bitcast_compatible && primitive {
+                let dest = if bitcast_compatible {
                     // if we're here, the type is scalar-like (a primitive or a
                     // SIMD type), and so can be handled as a by-value ValueRef
                     // and can also be directly bitcast to the target type.
@@ -205,7 +202,11 @@ pub fn trans_intrinsic_call<'a, 'blk, 'tcx>(mut bcx: Block<'blk, 'tcx>,
                     // are done efficiently implicitly in C with the `__m128i`
                     // type and so this means Rust doesn't lose out there).
                     let datum = unpack_datum!(bcx, expr::trans(bcx, &*arg_exprs[0]));
-                    let val = datum.to_llscalarish(bcx);
+                    let val = if datum.kind.is_by_ref() {
+                        load_ty(bcx, datum.val, datum.ty)
+                    } else {
+                        datum.val
+                    };
                     let cast_val = BitCast(bcx, val, llret_ty);

                     match dest {

gives me aborts when compiling stage2 core (i.e. the stage1 compiler is codegenned incorrectly):

rustc: x86_64-unknown-linux-gnu/stage2/lib/rustlib/x86_64-unknown-linux-gnu/lib/libcore

You've met with a terrible fate, haven't you?

fatal runtime error:  assertion failed: self.state != Armed

I assume the destructor of the input (i.e. `datum.val`), if it exists, needs to be cancelled, but I do not know how to do this. Handling destructors correctly seems significantly more error-prone, which may not be what we want for this minor tweak, mainly designed to allow slightly better handling of SIMD types.
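
For intuition, a hypothetical user-level illustration (names invented; this is not code from the patch): `transmute` takes ownership of its argument, so the source's destructor must never run afterwards. If the by-value lowering loads the value but leaves the compiler's cleanup for the source armed, that cleanup fires later, matching the assertion above.

    struct Guard(*mut u8);

    impl Drop for Guard {
        // Stand-in cleanup; the only point is that `Guard` has a destructor.
        fn drop(&mut self) { /* release the buffer */ }
    }

    // `transmute` consumes `g`, so the compiler must cancel `Guard::drop`;
    // were its cleanup left scheduled, the buffer would be released while
    // the caller still holds the returned pointer.
    unsafe fn into_raw(g: Guard) -> *mut u8 {
        std::mem::transmute::<Guard, *mut u8>(g)
    }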

@huonw force-pushed the transmute-inplace branch from 2c2041a to aca6f14 on December 4, 2014 01:13
@huonw changed the title from "Special-case transmute for primitive & SIMD types." to "Special-case transmute for primitive, SIMD & pointer types." Dec 4, 2014
@huonw force-pushed the transmute-inplace branch from aca6f14 to 1a62066 on December 4, 2014 01:15
bors added a commit that referenced this pull request Dec 11, 2014
@alexcrichton merged commit 1a62066 into rust-lang:master Dec 11, 2014
@alexcrichton (Member):

Looks like bors stalled out on this PR, but it passed on all platforms except the one slave that was lost, so I have merged this manually.

@huonw deleted the transmute-inplace branch December 16, 2014 02:29
huonw added a commit to huonw/dsfmt-rs that referenced this pull request Dec 16, 2014
lnicola pushed a commit to lnicola/rust that referenced this pull request Apr 28, 2025
minor: Show build scripts errors in server status