From 50cfd16a0626116fa5b7380e422aa34d2a68e70b Mon Sep 17 00:00:00 2001 From: Andrzej Janik Date: Thu, 5 Dec 2024 04:33:43 +0100 Subject: [PATCH] Minor fixes requried by geekbench --- cuda_base/src/lib.rs | 1 + ptx/lib/zluda_ptx_impl.bc | Bin 4816 -> 5360 bytes ptx/lib/zluda_ptx_impl.cpp | 17 +++++++--- ptx/src/pass/emit_llvm.rs | 29 ++++++++++++++++-- ...eplace_instructions_with_function_calls.rs | 3 ++ zluda/src/impl/memory.rs | 5 +++ zluda/src/lib.rs | 1 + zluda_inject/Cargo.toml | 2 +- zluda_inject/build.rs | 3 ++ zluda_inject/tests/helpers/do_cuinit_early.rs | 2 +- 10 files changed, 53 insertions(+), 10 deletions(-) diff --git a/cuda_base/src/lib.rs b/cuda_base/src/lib.rs index 833d372..e7a9677 100644 --- a/cuda_base/src/lib.rs +++ b/cuda_base/src/lib.rs @@ -193,6 +193,7 @@ fn join(fn_: Vec, find_module: bool) -> Punctuated { "func" => &["function"], "mem" => &["memory"], "memcpy" => &["memory", "copy"], + "memset" => &["memory", "set"], _ => return None, }) } diff --git a/ptx/lib/zluda_ptx_impl.bc b/ptx/lib/zluda_ptx_impl.bc index 4b5a5d8219ac46a8c19cabf67a36dea6f3cbe886..24c20d8b1bd94be0ef590b36498b2ddb325a1a31 100644 GIT binary patch delta 1217 zcmb7>Z%kWN7{;G-Z+q{hw53HH-MB5cgE>ehsdH2|KwB`RYGG>pfEv@@6|}fA?+Pn_ z;=*lVx$}#*xoC`8Vw+&*zeGq)G||Ksh(RRE&@5X+7F0Auzc4=$W30LS5x#o9ywCfd zljl6=cb1QB)+cMp@2O+w$)QxLntYg=agFKHTyhd-ouA+-8*l?a)rxbgSS%JvH=V1% zBUnRHewd38VIjk>{$yXz*k|MRLXKa{mP>3t1uXjpNl*%6K% zmX$$C83>QZICj*-%8vDnW045I#m5}8B7ZYuTJt*Y#2t$wzoA`Vn$I|v^8BLLQ3yI# zyw!^&M+fI>{?=0AVIi%`<_ze~QK87;YlTMeEq1VRj^}NI}KzVu$~W&i|RW z&t~i^B7f_EhB`tzyFE7IF;rb`hfk#AM$|`q%DR@B-Q~DNd#JyJX}$bEbE;j`GaY&U zEzPT1ZiJs~(lDcBgHG^^dHWr&qu}IkWlT%NQApco4O5Uf_Wz_R<+rHa7xx)NR@HFt44X~;=C|6}$ z&dq z(n@k%JzZL**ZPdsZ>yA=Ds5x#P6)OWx7FLF@9EESiF`sGH=d=)74=7Bgt1-`Y)utv zhw1e(&04o@olnMtTn(eMB!a?W!$kow0yt^59G##(Vtk(SGV=v&EI}Cb_z$XC8=}Ae8t)a0}00Y1eK%>{abJV>pYyjS-b~Y+0 zzzU#_0?GmScDYVddw>C3w%_~%aUTY-LIAC5e{HD+{vg1;*st20I{{$V7L?jy>3Zl4 z1(*Wp$G)ReNrHFsjnx{phFKp2cYA?6Z?K^IFqjPoLv&m z7+T>@6BwnV1){&sla9j8dOabOO-nLclv*6aB?`n9Km^brgs8h$GI-c;$tik-Qt&Aw z#>VQE{dY~XfU$?Jq!jtcCpK+d5F`4T(kMG@8sIik?6xUEf2I`KZ(~195AR-8fO;*L z$07RpcQ_1MqwW;ywMAum0m~X;2dfXcijCaKgZ*8v$o41EK;4ed9uT%g4 diff --git a/ptx/lib/zluda_ptx_impl.cpp b/ptx/lib/zluda_ptx_impl.cpp index f86a7fd..329a810 100644 --- a/ptx/lib/zluda_ptx_impl.cpp +++ b/ptx/lib/zluda_ptx_impl.cpp @@ -1,5 +1,6 @@ -// Every time this file changes it must te rebuilt, you need `rocm-llvm-dev` and `llvm-17`: -// /opt/rocm/llvm/bin/clang -Wall -Wextra -Wsign-compare -Wconversion -x hip zluda_ptx_impl.cpp -nogpulib -O3 -mno-wavefrontsize64 -o zluda_ptx_impl.bc -emit-llvm -c --offload-device-only --offload-arch=gfx1010 && /opt/rocm/llvm/bin/llvm-dis zluda_ptx_impl.bc -o - | sed '/@llvm.used/d' | sed '/wchar_size/d' | sed '/llvm.module.flags/d' | sed 's/define hidden/define linkonce_odr/g' | sed 's/\"target-cpu\"=\"gfx1010\"//g' | sed -E 's/\"target-features\"=\"[^\"]+\"//g' | sed 's/ nneg / /g' | sed 's/ disjoint / /g' | llvm-as-17 - -o zluda_ptx_impl.bc && /opt/rocm/llvm/bin/llvm-dis zluda_ptx_impl.bc +// Every time this file changes it must te rebuilt, you need `rocm-llvm-dev` and `llvm-17` +// `fdenormal-fp-math=dynamic` is required to make functions eligible for inlining +// /opt/rocm/llvm/bin/clang -Xclang -fdenormal-fp-math=dynamic -Wall -Wextra -Wsign-compare -Wconversion -x hip zluda_ptx_impl.cpp -nogpulib -O3 -mno-wavefrontsize64 -o zluda_ptx_impl.bc -emit-llvm -c --offload-device-only --offload-arch=gfx1010 && /opt/rocm/llvm/bin/llvm-dis zluda_ptx_impl.bc -o - | sed '/@llvm.used/d' | sed '/wchar_size/d' | sed '/llvm.module.flags/d' | sed 's/define hidden/define linkonce_odr/g' | sed 's/\"target-cpu\"=\"gfx1010\"//g' | sed -E 's/\"target-features\"=\"[^\"]+\"//g' | sed 's/ nneg / /g' | sed 's/ disjoint / /g' | llvm-as-17 - -o zluda_ptx_impl.bc && /opt/rocm/llvm/bin/llvm-dis zluda_ptx_impl.bc #include #include @@ -37,7 +38,7 @@ extern "C" return (uint32_t)__ockl_get_num_groups(member); } - uint32_t __ockl_bfe_u32(uint32_t, uint32_t, uint32_t) __attribute__((device)); + uint32_t __ockl_bfe_u32(uint32_t, uint32_t, uint32_t) __device__; uint32_t FUNC(bfe_u32)(uint32_t base, uint32_t pos_32, uint32_t len_32) { uint32_t pos = pos_32 & 0xFFU; @@ -65,7 +66,7 @@ extern "C" return (base >> pos) & ((1UL << len) - 1UL); } - int32_t __ockl_bfe_i32(int32_t, uint32_t, uint32_t) __attribute__((device)); + int32_t __ockl_bfe_i32(int32_t, uint32_t, uint32_t) __device__; int32_t FUNC(bfe_s32)(int32_t base, uint32_t pos_32, uint32_t len_32) { uint32_t pos = pos_32 & 0xFFU; @@ -120,7 +121,7 @@ extern "C" return (base << (64U - pos - len)) >> (64U - len); } - uint32_t __ockl_bfm_u32(uint32_t count, uint32_t offset) __attribute__((device)); + uint32_t __ockl_bfm_u32(uint32_t count, uint32_t offset) __device__; uint32_t FUNC(bfi_b32)(uint32_t insert, uint32_t base, uint32_t pos_32, uint32_t len_32) { uint32_t pos = pos_32 & 0xFFU; @@ -148,4 +149,10 @@ extern "C" mask = ((1UL << len) - 1UL) << (pos); return (~mask & base) | (mask & (insert << pos)); } + + void FUNC(bar_sync)(uint32_t) + { + __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup"); + __builtin_amdgcn_s_barrier(); + } } diff --git a/ptx/src/pass/emit_llvm.rs b/ptx/src/pass/emit_llvm.rs index 739e53d..71eb03c 100644 --- a/ptx/src/pass/emit_llvm.rs +++ b/ptx/src/pass/emit_llvm.rs @@ -534,7 +534,6 @@ impl<'a> MethodEmitContext<'a> { ast::Instruction::Sqrt { data, arguments } => self.emit_sqrt(data, arguments), ast::Instruction::Rsqrt { data, arguments } => self.emit_rsqrt(data, arguments), ast::Instruction::Selp { data, arguments } => self.emit_selp(data, arguments), - ast::Instruction::Bar { .. } => todo!(), ast::Instruction::Atom { data, arguments } => self.emit_atom(data, arguments), ast::Instruction::AtomCas { data, arguments } => self.emit_atom_cas(data, arguments), ast::Instruction::Div { data, arguments } => self.emit_div(data, arguments), @@ -554,6 +553,7 @@ impl<'a> MethodEmitContext<'a> { ast::Instruction::Trap {} => todo!(), // replaced by a function call ast::Instruction::Bfe { .. } + | ast::Instruction::Bar { .. } | ast::Instruction::Bfi { .. } | ast::Instruction::Activemask { .. } => return Err(error_unreachable()), } @@ -1565,8 +1565,12 @@ impl<'a> MethodEmitContext<'a> { Some(LLVMBuildFPToUI), ) } - ptx_parser::CvtMode::FPFromSigned(_) => todo!(), - ptx_parser::CvtMode::FPFromUnsigned(_) => todo!(), + ptx_parser::CvtMode::FPFromSigned(_) => { + return self.emit_cvt_int_to_float(data.to, arguments, LLVMBuildSIToFP) + } + ptx_parser::CvtMode::FPFromUnsigned(_) => { + return self.emit_cvt_int_to_float(data.to, arguments, LLVMBuildUIToFP) + } }; let src = self.resolver.value(arguments.src)?; self.resolver.with_result(arguments.dst, |dst| unsafe { @@ -1721,6 +1725,25 @@ impl<'a> MethodEmitContext<'a> { Ok(()) } + fn emit_cvt_int_to_float( + &mut self, + to: ptx_parser::ScalarType, + arguments: ptx_parser::CvtArgs, + llvm_func: unsafe extern "C" fn( + arg1: LLVMBuilderRef, + Val: LLVMValueRef, + DestTy: LLVMTypeRef, + Name: *const i8, + ) -> LLVMValueRef, + ) -> Result<(), TranslateError> { + let type_ = get_scalar_type(self.context, to); + let src = self.resolver.value(arguments.src)?; + self.resolver.with_result(arguments.dst, |dst| unsafe { + llvm_func(self.builder, src, type_, dst) + }); + Ok(()) + } + fn emit_rsqrt( &mut self, data: ptx_parser::TypeFtz, diff --git a/ptx/src/pass/replace_instructions_with_function_calls.rs b/ptx/src/pass/replace_instructions_with_function_calls.rs index 70d77d3..668cc21 100644 --- a/ptx/src/pass/replace_instructions_with_function_calls.rs +++ b/ptx/src/pass/replace_instructions_with_function_calls.rs @@ -104,6 +104,9 @@ fn run_instruction<'input>( let name = ["bfi_", scalar_to_ptx_name(data)].concat(); to_call(resolver, fn_declarations, name.into(), i)? } + i @ ptx_parser::Instruction::Bar { .. } => { + to_call(resolver, fn_declarations, "bar_sync".into(), i)? + } i => i, }) } diff --git a/zluda/src/impl/memory.rs b/zluda/src/impl/memory.rs index 3843776..33d5a4e 100644 --- a/zluda/src/impl/memory.rs +++ b/zluda/src/impl/memory.rs @@ -1,4 +1,5 @@ use hip_runtime_sys::*; +use std::mem; pub(crate) fn alloc_v2(dptr: *mut hipDeviceptr_t, bytesize: usize) -> hipError_t { unsafe { hipMalloc(dptr.cast(), bytesize) }?; @@ -33,3 +34,7 @@ pub(crate) fn get_address_range_v2( ) -> hipError_t { unsafe { hipMemGetAddressRange(pbase, psize, dptr) } } + +pub(crate) fn set_d32_v2(dst: hipDeviceptr_t, ui: ::core::ffi::c_uint, n: usize) -> hipError_t { + unsafe { hipMemsetD32(dst, mem::transmute(ui), n) } +} diff --git a/zluda/src/lib.rs b/zluda/src/lib.rs index 1568f47..1f6a7ff 100644 --- a/zluda/src/lib.rs +++ b/zluda/src/lib.rs @@ -72,6 +72,7 @@ cuda_base::cuda_function_declarations!( cuModuleUnload, cuPointerGetAttribute, cuMemGetAddressRange_v2, + cuMemsetD32_v2, ], implemented_in_function <= [ cuLaunchKernel, diff --git a/zluda_inject/Cargo.toml b/zluda_inject/Cargo.toml index 65113a4..20e2e2d 100644 --- a/zluda_inject/Cargo.toml +++ b/zluda_inject/Cargo.toml @@ -9,7 +9,7 @@ name = "zluda_with" path = "src/main.rs" [target.'cfg(windows)'.dependencies] -winapi = { version = "0.3", features = ["jobapi2", "processthreadsapi", "synchapi", "winbase", "std"] } +winapi = { version = "0.3", features = ["jobapi2", "processthreadsapi", "synchapi", "winbase", "std", "processenv"] } tempfile = "3" argh = "0.1" detours-sys = { path = "../detours-sys" } diff --git a/zluda_inject/build.rs b/zluda_inject/build.rs index ccce573..c79d2d2 100644 --- a/zluda_inject/build.rs +++ b/zluda_inject/build.rs @@ -7,6 +7,9 @@ use std::{ }; fn main() -> Result<(), VarError> { + if std::env::var_os("CARGO_CFG_WINDOWS").is_none() { + return Ok(()); + } println!("cargo:rerun-if-changed=build.rs"); if env::var("PROFILE")? != "debug" { return Ok(()); diff --git a/zluda_inject/tests/helpers/do_cuinit_early.rs b/zluda_inject/tests/helpers/do_cuinit_early.rs index 9743f4a..7d10855 100644 --- a/zluda_inject/tests/helpers/do_cuinit_early.rs +++ b/zluda_inject/tests/helpers/do_cuinit_early.rs @@ -1,6 +1,6 @@ #![crate_type = "bin"] -#[link(name = "do_cuinit")] +#[link(name = "do_cuinit", kind = "raw-dylib")] extern "system" { fn do_cuinit(flags: u32) -> u32; }