Add more SIMD intrinsics

antoyo · antoyo · commit cb36d78d7ba5 · 2024-09-15T17:24:52.000-04:00
diff --git a/src/base.rs b/src/base.rs
@@ -116,6 +116,10 @@ pub fn compile_codegen_unit(
             context.add_command_line_option("-mavx");
         }
 
+        /*for feature in tcx.sess.opts.cg.target_feature.split(',') {
+            println!("Feature: {}", feature);
+        }*/
+
         for arg in &tcx.sess.opts.cg.llvm_args {
             context.add_command_line_option(arg);
         }
@@ -218,6 +222,7 @@ pub fn compile_codegen_unit(
 
             // ... and now that we have everything pre-defined, fill out those definitions.
             for &(mono_item, _) in &mono_items {
+                //println!("{:?}", mono_item);
                 mono_item.define::<Builder<'_, '_, '_>>(&cx);
             }
 
diff --git a/src/builder.rs b/src/builder.rs
@@ -270,6 +270,8 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> {
                             actual_val.dereference(self.location).to_rvalue()
                         }
                     } else {
+                        // FIXME: this condition seems wrong: it will pass when neither type is
+                        // a vector.
                         assert!(
                             (!expected_ty.is_vector() || actual_ty.is_vector())
                                 && (expected_ty.is_vector() || !actual_ty.is_vector()),
@@ -283,6 +285,7 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> {
                         );
                         // TODO(antoyo): perhaps use __builtin_convertvector for vector casting.
                         // TODO: remove bitcast now that vector types can be compared?
+                        println!("Name: {}", func_name);
                         self.bitcast(actual_val, expected_ty)
                     }
                 } else {
diff --git a/src/declare.rs b/src/declare.rs
@@ -168,7 +168,23 @@ fn declare_raw_fn<'gcc>(
     variadic: bool,
 ) -> Function<'gcc> {
     if name.starts_with("llvm.") {
-        let intrinsic = llvm::intrinsic(name, cx);
+        let intrinsic = match name {
+            "llvm.fma.f16" => {
+                let param1 = cx.context.new_parameter(None, cx.double_type, "x");
+                let param2 = cx.context.new_parameter(None, cx.double_type, "y");
+                let param3 = cx.context.new_parameter(None, cx.double_type, "z");
+                cx.context.new_function(
+                    None,
+                    FunctionType::Extern,
+                    cx.double_type,
+                    &[param1, param2, param3],
+                    "fma",
+                    false,
+                )
+            }
+            _ => llvm::intrinsic(name, cx),
+        };
+
         cx.intrinsics.borrow_mut().insert(name.to_string(), intrinsic);
         return intrinsic;
     }
diff --git a/src/intrinsic/llvm.rs b/src/intrinsic/llvm.rs
@@ -1,5 +1,6 @@
 use std::borrow::Cow;
 
+use gccjit::CType;
 use gccjit::{Function, FunctionPtrType, RValue, ToRValue, UnaryOp};
 use rustc_codegen_ssa::traits::BuilderMethods;
 
@@ -320,7 +321,9 @@ pub fn adjust_intrinsic_arguments<'a, 'b, 'gcc, 'tcx>(
             | "__builtin_ia32_vpmadd52luq512_mask"
             | "__builtin_ia32_vpmadd52huq256_mask"
             | "__builtin_ia32_vpmadd52luq256_mask"
-            | "__builtin_ia32_vpmadd52huq128_mask" => {
+            | "__builtin_ia32_vpmadd52huq128_mask"
+            | "__builtin_ia32_vfmaddsubph128_mask"
+            | "__builtin_ia32_vfmaddsubph256_mask" => {
                 let mut new_args = args.to_vec();
                 let arg4_type = gcc_func.get_param_type(3);
                 let minus_one = builder.context.new_rvalue_from_int(arg4_type, -1);
@@ -440,6 +443,19 @@ pub fn adjust_intrinsic_arguments<'a, 'b, 'gcc, 'tcx>(
                 new_args.push(last_arg);
                 args = new_args.into();
             }
+            // NOTE: the LLVM intrinsics receive 3 floats, but the GCC builtin requires 3 vectors.
+            "__builtin_ia32_vfmaddsh3_mask" => {
+                let new_args = args.to_vec();
+                let arg1_type = gcc_func.get_param_type(0);
+                let arg2_type = gcc_func.get_param_type(1);
+                let arg3_type = gcc_func.get_param_type(2);
+                let arg5_type = gcc_func.get_param_type(4);
+                let a = builder.context.new_rvalue_from_vector(None, arg1_type, &[new_args[0]; 8]);
+                let b = builder.context.new_rvalue_from_vector(None, arg2_type, &[new_args[1]; 8]);
+                let c = builder.context.new_rvalue_from_vector(None, arg3_type, &[new_args[2]; 8]);
+                let arg5 = builder.context.new_rvalue_from_int(arg5_type, 4);
+                args = vec![a, b, c, new_args[3], arg5].into();
+            }
             _ => (),
         }
     } else {
@@ -452,7 +468,7 @@ pub fn adjust_intrinsic_arguments<'a, 'b, 'gcc, 'tcx>(
                 let arg4 = builder.context.new_bitcast(None, new_args[2], arg4_type);
                 args = vec![new_args[0], new_args[1], arg3, arg4, new_args[3], new_args[5]].into();
             }
-            // NOTE: the LLVM intrinsic receives 3 floats, but the GCC builtin requires 3 vectors.
+            // NOTE: the LLVM intrinsics receive 3 floats, but the GCC builtin requires 3 vectors.
             // FIXME: the intrinsics like _mm_mask_fmadd_sd should probably directly call the GCC
             // intrinsic to avoid this.
             "__builtin_ia32_vfmaddss3_round" => {
@@ -550,6 +566,25 @@ pub fn adjust_intrinsic_arguments<'a, 'b, 'gcc, 'tcx>(
                 ]
                 .into();
             }
+            "__builtin_ia32_rndscalesh_mask_round" => {
+                let new_args = args.to_vec();
+                args = vec![
+                    new_args[0],
+                    new_args[1],
+                    new_args[4],
+                    new_args[2],
+                    new_args[3],
+                    new_args[5],
+                ]
+                .into();
+            }
+            "fma" => {
+                let mut new_args = args.to_vec();
+                new_args[0] = builder.context.new_cast(None, new_args[0], builder.double_type);
+                new_args[1] = builder.context.new_cast(None, new_args[1], builder.double_type);
+                new_args[2] = builder.context.new_cast(None, new_args[2], builder.double_type);
+                args = new_args.into();
+            }
             _ => (),
         }
     }
@@ -566,7 +601,9 @@ pub fn adjust_intrinsic_return_value<'a, 'gcc, 'tcx>(
     orig_args: &[RValue<'gcc>],
 ) -> RValue<'gcc> {
     match func_name {
-        "__builtin_ia32_vfmaddss3_round" | "__builtin_ia32_vfmaddsd3_round" => {
+        "__builtin_ia32_vfmaddss3_round"
+        | "__builtin_ia32_vfmaddsd3_round"
+        | "__builtin_ia32_vfmaddsh3_mask" => {
             #[cfg(feature = "master")]
             {
                 let zero = builder.context.new_rvalue_zero(builder.int_type);
@@ -625,6 +662,10 @@ pub fn adjust_intrinsic_return_value<'a, 'gcc, 'tcx>(
                 &[random_number, success_variable.to_rvalue()],
             );
         }
+        "fma" => {
+            let f16_type = builder.context.new_c_type(CType::Float16);
+            return_value = builder.context.new_cast(None, return_value, f16_type);
+        }
         _ => (),
     }
 
@@ -1165,6 +1206,9 @@ pub fn intrinsic<'gcc, 'tcx>(name: &str, cx: &CodegenCx<'gcc, 'tcx>) -> Function
         "llvm.x86.avx512.mask.store.q.128" => "__builtin_ia32_movdqa64store128_mask",
         "llvm.x86.avx512.mask.store.ps.128" => "__builtin_ia32_storeaps128_mask",
         "llvm.x86.avx512.mask.store.pd.128" => "__builtin_ia32_storeapd128_mask",
+        "llvm.x86.avx512fp16.vfmadd.f16" => "__builtin_ia32_vfmaddsh3_mask",
+        "llvm.x86.avx512fp16.vfmaddsub.ph.128" => "__builtin_ia32_vfmaddsubph128_mask",
+        "llvm.x86.avx512fp16.vfmaddsub.ph.256" => "__builtin_ia32_vfmaddsubph256_mask",
 
         // TODO: support the tile builtins:
         "llvm.x86.ldtilecfg" => "__builtin_trap",

Original file line number	Diff line number	Diff line change
`@@ -116,6 +116,10 @@ pub fn compile_codegen_unit(`
`116`	`116`	`context.add_command_line_option("-mavx");`
`117`	`117`	`}`
`118`	`118`
	`119`	`+ /*for feature in tcx.sess.opts.cg.target_feature.split(',') {`
	`120`	`+ println!("Feature: {}", feature);`
	`121`	`+ }*/`
	`122`	`+`
`119`	`123`	`for arg in &tcx.sess.opts.cg.llvm_args {`
`120`	`124`	`context.add_command_line_option(arg);`
`121`	`125`	`}`
`@@ -218,6 +222,7 @@ pub fn compile_codegen_unit(`
`218`	`222`
`219`	`223`	`// ... and now that we have everything pre-defined, fill out those definitions.`
`220`	`224`	`for &(mono_item, _) in &mono_items {`
	`225`	`+ //println!("{:?}", mono_item);`
`221`	`226`	`mono_item.define::<Builder<'_, '_, '_>>(&cx);`
`222`	`227`	`}`
`223`	`228`