diff --git a/ptx/lib/zluda_ptx_impl.cl b/ptx/lib/zluda_ptx_impl.cl index a878ddd..85958d5 100644 --- a/ptx/lib/zluda_ptx_impl.cl +++ b/ptx/lib/zluda_ptx_impl.cl @@ -1,7 +1,10 @@ // Every time this file changes it must te rebuilt: // ocloc -file zluda_ptx_impl.cl -64 -options "-cl-std=CL2.0 -Dcl_intel_bit_instructions" -out_dir . -device kbl -output_no_suffix -spv_only // Additionally you should strip names: -// spirv-opt --strip-debug zluda_ptx_impl.spv -o zluda_ptx_impl.spv +// spirv-opt --strip-debug zluda_ptx_impl.spv -o zluda_ptx_impl.spv --target-env=spv1.3 + +#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable #define FUNC(NAME) __zluda_ptx_impl__ ## NAME @@ -25,6 +28,20 @@ return expected; \ } +#define atomic_add(NAME, SUCCESS, FAILURE, SCOPE, SPACE, TYPE, ATOMIC_TYPE, INT_TYPE) \ + TYPE FUNC(NAME)(SPACE TYPE* ptr, TYPE value) { \ + volatile SPACE ATOMIC_TYPE* atomic_ptr = (volatile SPACE ATOMIC_TYPE*)ptr; \ + union { \ + INT_TYPE int_view; \ + TYPE float_view; \ + } expected, desired; \ + expected.float_view = *ptr; \ + do { \ + desired.float_view = expected.float_view + value; \ + } while (!atomic_compare_exchange_strong_explicit(atomic_ptr, &expected.int_view, desired.int_view, SUCCESS, FAILURE, SCOPE)); \ + return expected.float_view; \ + } + // We are doing all this mess instead of accepting memory_order and memory_scope parameters // because ocloc emits broken (failing spirv-dis) SPIR-V when memory_order or memory_scope is a parameter @@ -120,6 +137,98 @@ atomic_dec(atom_acquire_sys_shared_dec, memory_order_acquire, memory_order_acqui atomic_dec(atom_release_sys_shared_dec, memory_order_release, memory_order_acquire, memory_scope_device, __local); atomic_dec(atom_acq_rel_sys_shared_dec, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local); +// atom.add.f32 +atomic_add(atom_relaxed_cta_generic_add_f32, memory_order_relaxed, memory_order_relaxed, 
memory_scope_work_group, , float, atomic_uint, uint); +atomic_add(atom_acquire_cta_generic_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_work_group, , float, atomic_uint, uint); +atomic_add(atom_release_cta_generic_add_f32, memory_order_release, memory_order_acquire, memory_scope_work_group, , float, atomic_uint, uint); +atomic_add(atom_acq_rel_cta_generic_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, , float, atomic_uint, uint); + +atomic_add(atom_relaxed_gpu_generic_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , float, atomic_uint, uint); +atomic_add(atom_acquire_gpu_generic_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint); +atomic_add(atom_release_gpu_generic_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint); +atomic_add(atom_acq_rel_gpu_generic_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint); + +atomic_add(atom_relaxed_sys_generic_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , float, atomic_uint, uint); +atomic_add(atom_acquire_sys_generic_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint); +atomic_add(atom_release_sys_generic_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint); +atomic_add(atom_acq_rel_sys_generic_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint); + +atomic_add(atom_relaxed_cta_global_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __global, float, atomic_uint, uint); +atomic_add(atom_acquire_cta_global_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __global, float, atomic_uint, uint); +atomic_add(atom_release_cta_global_add_f32, memory_order_release, 
memory_order_acquire, memory_scope_work_group, __global, float, atomic_uint, uint); +atomic_add(atom_acq_rel_cta_global_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __global, float, atomic_uint, uint); + +atomic_add(atom_relaxed_gpu_global_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, float, atomic_uint, uint); +atomic_add(atom_acquire_gpu_global_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint); +atomic_add(atom_release_gpu_global_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint); +atomic_add(atom_acq_rel_gpu_global_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint); + +atomic_add(atom_relaxed_sys_global_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, float, atomic_uint, uint); +atomic_add(atom_acquire_sys_global_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint); +atomic_add(atom_release_sys_global_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint); +atomic_add(atom_acq_rel_sys_global_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint); + +atomic_add(atom_relaxed_cta_shared_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __local, float, atomic_uint, uint); +atomic_add(atom_acquire_cta_shared_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __local, float, atomic_uint, uint); +atomic_add(atom_release_cta_shared_add_f32, memory_order_release, memory_order_acquire, memory_scope_work_group, __local, float, atomic_uint, uint); +atomic_add(atom_acq_rel_cta_shared_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __local, float, 
atomic_uint, uint); + +atomic_add(atom_relaxed_gpu_shared_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, float, atomic_uint, uint); +atomic_add(atom_acquire_gpu_shared_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint); +atomic_add(atom_release_gpu_shared_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint); +atomic_add(atom_acq_rel_gpu_shared_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint); + +atomic_add(atom_relaxed_sys_shared_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, float, atomic_uint, uint); +atomic_add(atom_acquire_sys_shared_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint); +atomic_add(atom_release_sys_shared_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint); +atomic_add(atom_acq_rel_sys_shared_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint); + +atomic_add(atom_relaxed_cta_generic_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, , double, atomic_ulong, ulong); +atomic_add(atom_acquire_cta_generic_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_work_group, , double, atomic_ulong, ulong); +atomic_add(atom_release_cta_generic_add_f64, memory_order_release, memory_order_acquire, memory_scope_work_group, , double, atomic_ulong, ulong); +atomic_add(atom_acq_rel_cta_generic_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, , double, atomic_ulong, ulong); + +atomic_add(atom_relaxed_gpu_generic_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , double, atomic_ulong, ulong); +atomic_add(atom_acquire_gpu_generic_add_f64, memory_order_acquire, 
memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong); +atomic_add(atom_release_gpu_generic_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong); +atomic_add(atom_acq_rel_gpu_generic_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong); + +atomic_add(atom_relaxed_sys_generic_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , double, atomic_ulong, ulong); +atomic_add(atom_acquire_sys_generic_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong); +atomic_add(atom_release_sys_generic_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong); +atomic_add(atom_acq_rel_sys_generic_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong); + +// atom.add.f64 (global and shared variants; the generic f64 variants are defined above) +atomic_add(atom_relaxed_cta_global_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __global, double, atomic_ulong, ulong); +atomic_add(atom_acquire_cta_global_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __global, double, atomic_ulong, ulong); +atomic_add(atom_release_cta_global_add_f64, memory_order_release, memory_order_acquire, memory_scope_work_group, __global, double, atomic_ulong, ulong); +atomic_add(atom_acq_rel_cta_global_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __global, double, atomic_ulong, ulong); + +atomic_add(atom_relaxed_gpu_global_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, double, atomic_ulong, ulong); +atomic_add(atom_acquire_gpu_global_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong); +atomic_add(atom_release_gpu_global_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __global, double, 
atomic_ulong, ulong); +atomic_add(atom_acq_rel_gpu_global_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong); + +atomic_add(atom_relaxed_sys_global_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, double, atomic_ulong, ulong); +atomic_add(atom_acquire_sys_global_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong); +atomic_add(atom_release_sys_global_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong); +atomic_add(atom_acq_rel_sys_global_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong); + +atomic_add(atom_relaxed_cta_shared_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __local, double, atomic_ulong, ulong); +atomic_add(atom_acquire_cta_shared_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __local, double, atomic_ulong, ulong); +atomic_add(atom_release_cta_shared_add_f64, memory_order_release, memory_order_acquire, memory_scope_work_group, __local, double, atomic_ulong, ulong); +atomic_add(atom_acq_rel_cta_shared_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __local, double, atomic_ulong, ulong); + +atomic_add(atom_relaxed_gpu_shared_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, double, atomic_ulong, ulong); +atomic_add(atom_acquire_gpu_shared_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong); +atomic_add(atom_release_gpu_shared_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong); +atomic_add(atom_acq_rel_gpu_shared_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong); + 
+atomic_add(atom_relaxed_sys_shared_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, double, atomic_ulong, ulong); +atomic_add(atom_acquire_sys_shared_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong); +atomic_add(atom_release_sys_shared_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong); +atomic_add(atom_acq_rel_sys_shared_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong); + uint FUNC(bfe_u32)(uint base, uint pos, uint len) { return intel_ubfe(base, pos, len); } @@ -136,11 +245,11 @@ long FUNC(bfe_s64)(long base, uint pos, uint len) { return intel_sbfe(base, pos, len); } -uint FUNC(bfi_b32)(uint base, uint insert, uint offset, uint count) { +uint FUNC(bfi_b32)(uint insert, uint base, uint offset, uint count) { return intel_bfi(base, insert, offset, count); } -ulong FUNC(bfi_b64)(ulong base, ulong insert, uint offset, uint count) { +ulong FUNC(bfi_b64)(ulong insert, ulong base, uint offset, uint count) { return intel_bfi(base, insert, offset, count); } diff --git a/ptx/lib/zluda_ptx_impl.spv b/ptx/lib/zluda_ptx_impl.spv index 8a2d697..ca16447 100644 Binary files a/ptx/lib/zluda_ptx_impl.spv and b/ptx/lib/zluda_ptx_impl.spv differ diff --git a/ptx/src/test/spirv_run/atom_add_float.ptx b/ptx/src/test/spirv_run/atom_add_float.ptx new file mode 100644 index 0000000..3e3b748 --- /dev/null +++ b/ptx/src/test/spirv_run/atom_add_float.ptx @@ -0,0 +1,28 @@ +.version 6.5 +.target sm_30 +.address_size 64 + +.visible .entry atom_add_float( + .param .u64 input, + .param .u64 output +) +{ + .shared .align 4 .b8 shared_mem[1024]; + + .reg .u64 in_addr; + .reg .u64 out_addr; + .reg .f32 temp1; + .reg .f32 temp2; + + ld.param.u64 in_addr, [input]; + ld.param.u64 out_addr, [output]; + + ld.f32 temp1, [in_addr]; + ld.f32 temp2, [in_addr+4]; + st.shared.f32 
[shared_mem], temp1; + atom.shared.add.f32 temp1, [shared_mem], temp2; + ld.shared.f32 temp2, [shared_mem]; + st.f32 [out_addr], temp1; + st.f32 [out_addr+4], temp2; + ret; +} diff --git a/ptx/src/test/spirv_run/atom_add_float.spvtxt b/ptx/src/test/spirv_run/atom_add_float.spvtxt new file mode 100644 index 0000000..c2292f1 --- /dev/null +++ b/ptx/src/test/spirv_run/atom_add_float.spvtxt @@ -0,0 +1,81 @@ + OpCapability GenericPointer + OpCapability Linkage + OpCapability Addresses + OpCapability Kernel + OpCapability Int8 + OpCapability Int16 + OpCapability Int64 + OpCapability Float16 + OpCapability Float64 + %42 = OpExtInstImport "OpenCL.std" + OpMemoryModel Physical64 OpenCL + OpEntryPoint Kernel %1 "atom_add_float" %4 + OpDecorate %37 LinkageAttributes "__zluda_ptx_impl__atom_relaxed_gpu_shared_add_f32" Import + OpDecorate %4 Alignment 4 + %void = OpTypeVoid + %float = OpTypeFloat 32 +%_ptr_Workgroup_float = OpTypePointer Workgroup %float + %46 = OpTypeFunction %float %_ptr_Workgroup_float %float + %uint = OpTypeInt 32 0 + %uchar = OpTypeInt 8 0 + %uint_1024 = OpConstant %uint 1024 +%_arr_uchar_uint_1024 = OpTypeArray %uchar %uint_1024 +%_ptr_Workgroup__arr_uchar_uint_1024 = OpTypePointer Workgroup %_arr_uchar_uint_1024 + %4 = OpVariable %_ptr_Workgroup__arr_uchar_uint_1024 Workgroup + %ulong = OpTypeInt 64 0 + %53 = OpTypeFunction %void %ulong %ulong +%_ptr_Function_ulong = OpTypePointer Function %ulong +%_ptr_Function_float = OpTypePointer Function %float +%_ptr_Generic_float = OpTypePointer Generic %float + %ulong_4 = OpConstant %ulong 4 + %ulong_4_0 = OpConstant %ulong 4 + %37 = OpFunction %float None %46 + %39 = OpFunctionParameter %_ptr_Workgroup_float + %40 = OpFunctionParameter %float + OpFunctionEnd + %1 = OpFunction %void None %53 + %9 = OpFunctionParameter %ulong + %10 = OpFunctionParameter %ulong + %36 = OpLabel + %2 = OpVariable %_ptr_Function_ulong Function + %3 = OpVariable %_ptr_Function_ulong Function + %5 = OpVariable %_ptr_Function_ulong 
Function + %6 = OpVariable %_ptr_Function_ulong Function + %7 = OpVariable %_ptr_Function_float Function + %8 = OpVariable %_ptr_Function_float Function + OpStore %2 %9 + OpStore %3 %10 + %11 = OpLoad %ulong %2 Aligned 8 + OpStore %5 %11 + %12 = OpLoad %ulong %3 Aligned 8 + OpStore %6 %12 + %14 = OpLoad %ulong %5 + %29 = OpConvertUToPtr %_ptr_Generic_float %14 + %13 = OpLoad %float %29 Aligned 4 + OpStore %7 %13 + %16 = OpLoad %ulong %5 + %26 = OpIAdd %ulong %16 %ulong_4 + %30 = OpConvertUToPtr %_ptr_Generic_float %26 + %15 = OpLoad %float %30 Aligned 4 + OpStore %8 %15 + %17 = OpLoad %float %7 + %31 = OpBitcast %_ptr_Workgroup_float %4 + OpStore %31 %17 Aligned 4 + %19 = OpLoad %float %8 + %32 = OpBitcast %_ptr_Workgroup_float %4 + %18 = OpFunctionCall %float %37 %32 %19 + OpStore %7 %18 + %33 = OpBitcast %_ptr_Workgroup_float %4 + %20 = OpLoad %float %33 Aligned 4 + OpStore %8 %20 + %21 = OpLoad %ulong %6 + %22 = OpLoad %float %7 + %34 = OpConvertUToPtr %_ptr_Generic_float %21 + OpStore %34 %22 Aligned 4 + %23 = OpLoad %ulong %6 + %24 = OpLoad %float %8 + %28 = OpIAdd %ulong %23 %ulong_4_0 + %35 = OpConvertUToPtr %_ptr_Generic_float %28 + OpStore %35 %24 Aligned 4 + OpReturn + OpFunctionEnd diff --git a/ptx/src/test/spirv_run/mod.rs b/ptx/src/test/spirv_run/mod.rs index c99de17..c802320 100644 --- a/ptx/src/test/spirv_run/mod.rs +++ b/ptx/src/test/spirv_run/mod.rs @@ -155,6 +155,7 @@ test_ptx!(cvt_s64_s32, [-1i32], [-1i64]); test_ptx!(add_tuning, [2u64], [3u64]); test_ptx!(add_non_coherent, [3u64], [4u64]); test_ptx!(sign_extend, [-1i16], [-1i32]); +test_ptx!(atom_add_float, [1.25f32, 0.5f32], [1.25f32, 1.75f32]); struct DisplayError { err: T, diff --git a/ptx/src/translate.rs b/ptx/src/translate.rs index 7566be8..3291ad5 100644 --- a/ptx/src/translate.rs +++ b/ptx/src/translate.rs @@ -1505,6 +1505,7 @@ fn extract_globals<'input, 'b>( d, a, "inc", + ast::SizedScalarType::U32, )); } Statement::Instruction(ast::Instruction::Atom( @@ -1526,6 +1527,44 @@ fn 
extract_globals<'input, 'b>( d, a, "dec", + ast::SizedScalarType::U32, + )); + } + Statement::Instruction(ast::Instruction::Atom( + ast::AtomDetails { + inner: + ast::AtomInnerDetails::Float { + op: ast::AtomFloatOp::Add, + typ, + }, + semantics, + scope, + space, + }, + a, + )) => { + let details = ast::AtomDetails { + inner: ast::AtomInnerDetails::Float { + op: ast::AtomFloatOp::Add, + typ, + }, + semantics, + scope, + space, + }; + let (op, typ) = match typ { + ast::FloatType::F32 => ("add_f32", ast::SizedScalarType::F32), + ast::FloatType::F64 => ("add_f64", ast::SizedScalarType::F64), + ast::FloatType::F16 => unreachable!(), + ast::FloatType::F16x2 => unreachable!(), + }; + local.push(to_ptx_impl_atomic_call( + id_def, + ptx_impl_imports, + details, + a, + op, + typ, )); } s => local.push(s), @@ -1696,6 +1735,7 @@ fn to_ptx_impl_atomic_call( details: ast::AtomDetails, arg: ast::Arg3, op: &'static str, + typ: ast::SizedScalarType, ) -> ExpandedStatement { let semantics = ptx_semantics_name(details.semantics); let scope = ptx_scope_name(details.scope); @@ -1710,15 +1750,14 @@ fn to_ptx_impl_atomic_call( ast::AtomSpace::Global => ast::PointerStateSpace::Global, ast::AtomSpace::Shared => ast::PointerStateSpace::Shared, }; + let scalar_typ = ast::ScalarType::from(typ); let fn_id = match ptx_impl_imports.entry(fn_name) { hash_map::Entry::Vacant(entry) => { let fn_id = id_defs.new_non_variable(None); let func_decl = ast::MethodDecl::Func::( vec![ast::FnArgument { align: None, - v_type: ast::FnArgumentType::Reg(ast::VariableRegType::Scalar( - ast::ScalarType::U32, - )), + v_type: ast::FnArgumentType::Reg(ast::VariableRegType::Scalar(scalar_typ)), name: id_defs.new_non_variable(None), array_init: Vec::new(), }], @@ -1727,17 +1766,14 @@ fn to_ptx_impl_atomic_call( ast::FnArgument { align: None, v_type: ast::FnArgumentType::Reg(ast::VariableRegType::Pointer( - ast::SizedScalarType::U32, - ptr_space, + typ, ptr_space, )), name: id_defs.new_non_variable(None), array_init: 
Vec::new(), }, ast::FnArgument { align: None, - v_type: ast::FnArgumentType::Reg(ast::VariableRegType::Scalar( - ast::ScalarType::U32, - )), + v_type: ast::FnArgumentType::Reg(ast::VariableRegType::Scalar(scalar_typ)), name: id_defs.new_non_variable(None), array_init: Vec::new(), }, @@ -1768,19 +1804,16 @@ fn to_ptx_impl_atomic_call( func: fn_id, ret_params: vec![( arg.dst, - ast::FnArgumentType::Reg(ast::VariableRegType::Scalar(ast::ScalarType::U32)), + ast::FnArgumentType::Reg(ast::VariableRegType::Scalar(scalar_typ)), )], param_list: vec![ ( arg.src1, - ast::FnArgumentType::Reg(ast::VariableRegType::Pointer( - ast::SizedScalarType::U32, - ptr_space, - )), + ast::FnArgumentType::Reg(ast::VariableRegType::Pointer(typ, ptr_space)), ), ( arg.src2, - ast::FnArgumentType::Reg(ast::VariableRegType::Scalar(ast::ScalarType::U32)), + ast::FnArgumentType::Reg(ast::VariableRegType::Scalar(scalar_typ)), ), ], }) @@ -1963,14 +1996,13 @@ fn to_ptx_impl_bfi_call( arg.dst, ast::FnArgumentType::Reg(ast::VariableRegType::Scalar(typ.into())), )], - // Note, for some reason PTX and SPIR-V order base&insert arguments differently param_list: vec![ ( - arg.src2, + arg.src1, ast::FnArgumentType::Reg(ast::VariableRegType::Scalar(typ.into())), ), ( - arg.src1, + arg.src2, ast::FnArgumentType::Reg(ast::VariableRegType::Scalar(typ.into())), ), ( @@ -3476,8 +3508,12 @@ fn emit_atom( }; (spirv_op, typ.into()) } - // TODO: Hardware is capable of this, implement it through builtin - ast::AtomInnerDetails::Float { .. 
} => todo!(), + ast::AtomInnerDetails::Float { op, typ } => { + let spirv_op: fn(&mut dr::Builder, _, _, _, _, _, _) -> _ = match op { + ast::AtomFloatOp::Add => dr::Builder::atomic_f_add_ext, + }; + (spirv_op, typ.into()) + } }; let result_type = map.get_or_add_scalar(builder, typ); let memory_const = map.get_or_add_constant( @@ -4287,8 +4323,8 @@ fn emit_implicit_conversion( } (TypeKind::Scalar, TypeKind::Scalar, ConversionKind::SignExtend) => { let result_type = map.get_or_add(builder, SpirvType::from(cv.to.clone())); - builder.s_convert(result_type , Some(cv.dst), cv.src)?; - }, + builder.s_convert(result_type, Some(cv.dst), cv.src)?; + } (TypeKind::Vector, TypeKind::Scalar, ConversionKind::Default) | (TypeKind::Scalar, TypeKind::Array, ConversionKind::Default) | (TypeKind::Array, TypeKind::Scalar, ConversionKind::Default) => {