diff --git a/Cargo.lock b/Cargo.lock index e62be8e..47ed2e2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -270,24 +270,6 @@ dependencies = [ "unicode-segmentation", ] -[[package]] -name = "cuda-config" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ee74643f7430213a1a78320f88649de309b20b80818325575e393f848f79f5d" -dependencies = [ - "glob", -] - -[[package]] -name = "cuda-driver-sys" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d4c552cc0de854877d80bcd1f11db75d42be32962d72a6799b88dcca88fffbd" -dependencies = [ - "cuda-config", -] - [[package]] name = "cuda_base" version = "0.0.0" @@ -938,10 +920,12 @@ dependencies = [ "bit-vec 0.6.3", "bitflags 1.3.2", "comgr", - "cuda-driver-sys", + "cuda_base", + "cuda_types", "half", "hip_runtime-sys", "int-enum", + "libloading", "llvm_zluda", "microlp", "paste", diff --git a/comgr/Cargo.toml b/comgr/Cargo.toml index 2e2cb72..171002e 100644 --- a/comgr/Cargo.toml +++ b/comgr/Cargo.toml @@ -8,4 +8,4 @@ edition = "2021" [dependencies] amd_comgr-sys = { path = "../ext/amd_comgr-sys" } -libloading = "0.8" \ No newline at end of file +libloading = "0.8" diff --git a/cuda_base/src/lib.rs b/cuda_base/src/lib.rs index e698baa..0f40843 100644 --- a/cuda_base/src/lib.rs +++ b/cuda_base/src/lib.rs @@ -199,7 +199,7 @@ impl VisitMut for FixFnSignatures { } const MODULES: &[&str] = &[ - "context", "device", "driver", "function", "link", "memory", "module", "pointer", + "context", "device", "driver", "function", "link", "memory", "module", "pointer", "stream", ]; #[proc_macro] diff --git a/ptx/Cargo.toml b/ptx/Cargo.toml index 2876539..c22ae1b 100644 --- a/ptx/Cargo.toml +++ b/ptx/Cargo.toml @@ -25,7 +25,9 @@ unwrap_or = "1.0.1" [dev-dependencies] hip_runtime-sys = { path = "../ext/hip_runtime-sys" } comgr = { path = "../comgr" } +cuda_types = { path = "../cuda_types" } +cuda_base = { path = "../cuda_base" } tempfile = "3" paste = "1.0" -cuda-driver-sys = "0.3.0" -pretty_assertions = "1.4.1" \ No newline at end of file +pretty_assertions = "1.4.1" +libloading = "0.8" diff --git a/ptx/src/test/ll/mad_s32.ll b/ptx/src/test/ll/mad_s32.ll index 37db9d3..f0512a6 100644 --- a/ptx/src/test/ll/mad_s32.ll +++ b/ptx/src/test/ll/mad_s32.ll @@ -1,54 +1,44 @@ -define amdgpu_kernel void @mad_s32(ptr addrspace(4) byref(i64) %"41", ptr addrspace(4) byref(i64) %"42") #0 { - %"43" = alloca i64, align 8, addrspace(5) - %"44" = alloca i64, align 8, addrspace(5) - %"45" = alloca i32, align 4, addrspace(5) - %"46" = alloca i32, align 4, addrspace(5) - %"47" = alloca i32, align 4, addrspace(5) - %"48" = alloca i32, align 4, addrspace(5) +define amdgpu_kernel void @mad_s32(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 { + %"39" = alloca i64, align 8, addrspace(5) + %"40" = alloca i64, align 8, addrspace(5) + %"41" = alloca i32, align 4, addrspace(5) + %"42" = alloca i32, align 4, addrspace(5) + %"43" = alloca i32, align 4, addrspace(5) + %"44" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - br label %"40" + br label %"36" -"40": ; preds = %1 - %"49" = load i64, ptr addrspace(4) %"41", align 4 - store i64 %"49", ptr addrspace(5) %"43", align 4 - %"50" = load i64, ptr addrspace(4) %"42", align 4 - store i64 %"50", ptr addrspace(5) %"44", align 4 - %"52" = load i64, ptr addrspace(5) %"43", align 4 - %"67" = inttoptr i64 %"52" to ptr - %"51" = load i32, ptr %"67", align 4 - store i32 %"51", ptr addrspace(5) %"46", align 4 - %"53" = load i64, ptr addrspace(5) %"43", align 4 - %"68" = inttoptr i64 %"53" to ptr - %"33" = getelementptr inbounds i8, ptr %"68", i64 4 - %"54" = load i32, ptr %"33", align 4 - store i32 %"54", ptr addrspace(5) %"47", align 4 - %"55" = load i64, ptr addrspace(5) %"43", align 4 - %"69" = inttoptr i64 %"55" to ptr - %"35" = getelementptr inbounds i8, ptr %"69", i64 8 - %"56" = load i32, ptr %"35", align 4 - store i32 %"56", ptr addrspace(5) %"48", align 4 - %"58" = load i32, ptr addrspace(5) %"46", align 4 - %"59" = load i32, ptr addrspace(5) %"47", align 4 - %"60" = load i32, ptr addrspace(5) %"48", align 4 - %2 = mul i32 %"58", %"59" - %"57" = add i32 %2, %"60" - store i32 %"57", ptr addrspace(5) %"45", align 4 - %"61" = load i64, ptr addrspace(5) %"44", align 4 - %"62" = load i32, ptr addrspace(5) %"45", align 4 - %"70" = inttoptr i64 %"61" to ptr - store i32 %"62", ptr %"70", align 4 - %"63" = load i64, ptr addrspace(5) %"44", align 4 - %"71" = inttoptr i64 %"63" to ptr - %"37" = getelementptr inbounds i8, ptr %"71", i64 4 - %"64" = load i32, ptr addrspace(5) %"45", align 4 - store i32 %"64", ptr %"37", align 4 - %"65" = load i64, ptr addrspace(5) %"44", align 4 - %"72" = inttoptr i64 %"65" to ptr - %"39" = getelementptr inbounds i8, ptr %"72", i64 8 - %"66" = load i32, ptr addrspace(5) %"45", align 4 - store i32 %"66", ptr %"39", align 4 +"36": ; preds = %1 + %"45" = load i64, ptr addrspace(4) %"37", align 4 + store i64 %"45", ptr addrspace(5) %"39", align 4 + %"46" = load i64, ptr addrspace(4) %"38", align 4 + store i64 %"46", ptr addrspace(5) %"40", align 4 + %"48" = load i64, ptr addrspace(5) %"39", align 4 + %"59" = inttoptr i64 %"48" to ptr + %"47" = load i32, ptr %"59", align 4 + store i32 %"47", ptr addrspace(5) %"42", align 4 + %"49" = load i64, ptr addrspace(5) %"39", align 4 + %"60" = inttoptr i64 %"49" to ptr + %"33" = getelementptr inbounds i8, ptr %"60", i64 4 + %"50" = load i32, ptr %"33", align 4 + store i32 %"50", ptr addrspace(5) %"43", align 4 + %"51" = load i64, ptr addrspace(5) %"39", align 4 + %"61" = inttoptr i64 %"51" to ptr + %"35" = getelementptr inbounds i8, ptr %"61", i64 8 + %"52" = load i32, ptr %"35", align 4 + store i32 %"52", ptr addrspace(5) %"44", align 4 + %"54" = load i32, ptr addrspace(5) %"42", align 4 + %"55" = load i32, ptr addrspace(5) %"43", align 4 + %"56" = load i32, ptr addrspace(5) %"44", align 4 + %2 = mul i32 %"54", %"55" + %"53" = add i32 %2, %"56" + store i32 %"53", ptr addrspace(5) %"41", align 4 + %"57" = load i64, ptr addrspace(5) %"40", align 4 + %"58" = load i32, ptr addrspace(5) %"41", align 4 + %"62" = inttoptr i64 %"57" to ptr + store i32 %"58", ptr %"62", align 4 ret void } diff --git a/ptx/src/test/ll/mad_wide.ll b/ptx/src/test/ll/mad_wide.ll new file mode 100644 index 0000000..f44ab2f --- /dev/null +++ b/ptx/src/test/ll/mad_wide.ll @@ -0,0 +1,47 @@ +define amdgpu_kernel void @mad_wide(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 { + %"39" = alloca i64, align 8, addrspace(5) + %"40" = alloca i64, align 8, addrspace(5) + %"41" = alloca i64, align 8, addrspace(5) + %"42" = alloca i32, align 4, addrspace(5) + %"43" = alloca i32, align 4, addrspace(5) + %"44" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + br label %"36" + +"36": ; preds = %1 + %"45" = load i64, ptr addrspace(4) %"37", align 4 + store i64 %"45", ptr addrspace(5) %"39", align 4 + %"46" = load i64, ptr addrspace(4) %"38", align 4 + store i64 %"46", ptr addrspace(5) %"40", align 4 + %"48" = load i64, ptr addrspace(5) %"39", align 4 + %"59" = inttoptr i64 %"48" to ptr + %"47" = load i32, ptr %"59", align 4 + store i32 %"47", ptr addrspace(5) %"42", align 4 + %"49" = load i64, ptr addrspace(5) %"39", align 4 + %"60" = inttoptr i64 %"49" to ptr + %"33" = getelementptr inbounds i8, ptr %"60", i64 4 + %"50" = load i32, ptr %"33", align 4 + store i32 %"50", ptr addrspace(5) %"43", align 4 + %"51" = load i64, ptr addrspace(5) %"39", align 4 + %"61" = inttoptr i64 %"51" to ptr + %"35" = getelementptr inbounds i8, ptr %"61", i64 8 + %"52" = load i64, ptr %"35", align 4 + store i64 %"52", ptr addrspace(5) %"44", align 4 + %"54" = load i32, ptr addrspace(5) %"42", align 4 + %"55" = load i32, ptr addrspace(5) %"43", align 4 + %"56" = load i64, ptr addrspace(5) %"44", align 4 + %2 = sext i32 %"54" to i64 + %3 = sext i32 %"55" to i64 + %4 = mul i64 %2, %3 + %"53" = add i64 %4, %"56" + store i64 %"53", ptr addrspace(5) %"41", align 4 + %"57" = load i64, ptr addrspace(5) %"40", align 4 + %"58" = load i64, ptr addrspace(5) %"41", align 4 + %"62" = inttoptr i64 %"57" to ptr + store i64 %"58", ptr %"62", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/spirv_run/mad_s32.ptx b/ptx/src/test/spirv_run/mad_s32.ptx index a864266..83c2896 100644 --- a/ptx/src/test/spirv_run/mad_s32.ptx +++ b/ptx/src/test/spirv_run/mad_s32.ptx @@ -22,7 +22,5 @@ ld.s32 src3, [in_addr+8]; mad.lo.s32 dst, src1, src2, src3; st.s32 [out_addr], dst; - st.s32 [out_addr+4], dst; - st.s32 [out_addr+8], dst; ret; } diff --git a/ptx/src/test/spirv_run/mad_wide.ptx b/ptx/src/test/spirv_run/mad_wide.ptx new file mode 100644 index 0000000..a68ebfe --- /dev/null +++ b/ptx/src/test/spirv_run/mad_wide.ptx @@ -0,0 +1,27 @@ +.version 6.5 +.target sm_30 +.address_size 64 + +.visible .entry mad_wide( + .param .u64 input, + .param .u64 output +) +{ + .reg .u64 in_addr; + .reg .u64 out_addr; + + .reg .s64 dst; + .reg .s32 src1; + .reg .s32 src2; + .reg .s64 src3; + + ld.param.u64 in_addr, [input]; + ld.param.u64 out_addr, [output]; + + ld.s32 src1, [in_addr]; + ld.s32 src2, [in_addr+4]; + ld.s64 src3, [in_addr+8]; + mad.wide.s32 dst, src1, src2, src3; + st.s64 [out_addr], dst; + ret; +} diff --git a/ptx/src/test/spirv_run/mod.rs b/ptx/src/test/spirv_run/mod.rs index bca910b..f2c8ffa 100644 --- a/ptx/src/test/spirv_run/mod.rs +++ b/ptx/src/test/spirv_run/mod.rs @@ -1,5 +1,6 @@ use crate::pass; use comgr::Comgr; +use cuda_types::cuda::CUstream; use hip_runtime_sys::hipError_t; use pretty_assertions; use std::env; @@ -95,7 +96,8 @@ test_ptx!(b64tof64, [111u64], [111u64]); // This segfaults NV compiler // test_ptx!(implicit_param, [34u32], [34u32]); test_ptx!(pred_not, [10u64, 11u64], [2u64, 0u64]); -test_ptx!(mad_s32, [2i32, 3i32, 4i32], [10i32, 10i32, 10i32]); +test_ptx!(mad_s32, [2i32, 3i32, 4i32], [10i32]); +test_ptx!(mad_wide, [-1i32, 3, 4, 5], [21474836481i64]); test_ptx!( mul_wide, [0x01_00_00_00__01_00_00_00i64], @@ -313,60 +315,126 @@ fn test_cuda_assert< output: &mut [Output], ) -> Result<(), Box> { let name = CString::new(name)?; - let result = - run_cuda(name.as_c_str(), ptx_text, input, output).map_err(|err| DisplayError { err })?; + let result = run_cuda(name.as_c_str(), ptx_text, input, output); assert_eq!(result.as_slice(), output); Ok(()) } -macro_rules! cuda_call { - ($expr:expr) => { - #[allow(unused_unsafe)] - { - let err = unsafe { $expr }; - if err != cuda_driver_sys::CUresult::CUDA_SUCCESS { - return Result::Err(err); - } - } - }; -} - fn run_cuda + Copy + Debug, Output: From + Copy + Debug + Default>( name: &CStr, ptx_module: &str, input: &[Input], output: &mut [Output], -) -> Result, cuda_driver_sys::CUresult> { - use cuda_driver_sys::*; - cuda_call! { cuInit(0) }; +) -> Vec { + unsafe { CUDA.cuInit(0) }.unwrap().unwrap(); let ptx_module = CString::new(ptx_module).unwrap(); let mut result = vec![0u8.into(); output.len()]; { - let mut ctx = ptr::null_mut(); - cuda_call! { cuCtxCreate_v2(&mut ctx, 0, 0) }; - let mut module = ptr::null_mut(); - cuda_call! { cuModuleLoadData(&mut module, ptx_module.as_ptr() as _) }; - let mut kernel = ptr::null_mut(); - cuda_call! { cuModuleGetFunction(&mut kernel, module, name.as_ptr()) }; + let mut ctx = unsafe { mem::zeroed() }; + unsafe { CUDA.cuCtxCreate_v2(&mut ctx, 0, 0) } + .unwrap() + .unwrap(); + let mut module = unsafe { mem::zeroed() }; + unsafe { CUDA.cuModuleLoadData(&mut module, ptx_module.as_ptr() as _) } + .unwrap() + .unwrap(); + let mut kernel = unsafe { mem::zeroed() }; + unsafe { CUDA.cuModuleGetFunction(&mut kernel, module, name.as_ptr()) } + .unwrap() + .unwrap(); let mut inp_b = unsafe { mem::zeroed() }; - cuda_call! { cuMemAlloc_v2(&mut inp_b, input.len() * mem::size_of::()) }; + unsafe { CUDA.cuMemAlloc_v2(&mut inp_b, input.len() * mem::size_of::()) } + .unwrap() + .unwrap(); let mut out_b = unsafe { mem::zeroed() }; - cuda_call! { cuMemAlloc_v2(&mut out_b, output.len() * mem::size_of::()) }; - cuda_call! { cuMemcpyHtoD_v2(inp_b, input.as_ptr() as _, input.len() * mem::size_of::()) }; - cuda_call! { cuMemsetD8_v2(out_b, 0, output.len() * mem::size_of::()) }; + unsafe { CUDA.cuMemAlloc_v2(&mut out_b, output.len() * mem::size_of::()) } + .unwrap() + .unwrap(); + unsafe { + CUDA.cuMemcpyHtoD_v2( + inp_b, + input.as_ptr() as _, + input.len() * mem::size_of::(), + ) + } + .unwrap() + .unwrap(); + unsafe { CUDA.cuMemsetD8_v2(out_b, 0, output.len() * mem::size_of::()) } + .unwrap() + .unwrap(); let mut args = [&inp_b, &out_b]; - cuda_call! { cuLaunchKernel(kernel, 1,1,1,1,1,1, 1024, 0 as _, args.as_mut_ptr() as _, ptr::null_mut()) }; - cuda_call! { cuMemcpyDtoH_v2(result.as_mut_ptr() as _, out_b, output.len() * mem::size_of::()) }; - cuda_call! { cuStreamSynchronize(0 as _) }; - cuda_call! { cuMemFree_v2(inp_b) }; - cuda_call! { cuMemFree_v2(out_b) }; - cuda_call! { cuModuleUnload(module) }; - cuda_call! { cuCtxDestroy_v2(ctx) }; + unsafe { + CUDA.cuLaunchKernel( + kernel, + 1, + 1, + 1, + 1, + 1, + 1, + 1024, + CUstream(ptr::null_mut()), + args.as_mut_ptr() as _, + ptr::null_mut(), + ) + } + .unwrap() + .unwrap(); + unsafe { + CUDA.cuMemcpyDtoH_v2( + result.as_mut_ptr() as _, + out_b, + output.len() * mem::size_of::(), + ) + } + .unwrap() + .unwrap(); + unsafe { CUDA.cuStreamSynchronize(CUstream(ptr::null_mut())) } + .unwrap() + .unwrap(); + unsafe { CUDA.cuMemFree_v2(inp_b) }.unwrap().unwrap(); + unsafe { CUDA.cuMemFree_v2(out_b) }.unwrap().unwrap(); + unsafe { CUDA.cuModuleUnload(module) }.unwrap().unwrap(); + unsafe { CUDA.cuCtxDestroy_v2(ctx) }.unwrap().unwrap(); } - Ok(result) + result } +struct DynamicCuda { + lib: libloading::Library, +} + +impl DynamicCuda { + #[cfg(not(windows))] + const CUDA_PATH: &'static str = "/usr/lib/x86_64-linux-gnu/libcuda.so.1"; + #[cfg(windows)] + const CUDA_PATH: &'static str = "C:\\Windows\\System32\\nvcuda.dll"; + + pub fn new() -> Result { + let lib = unsafe { libloading::Library::new(Self::CUDA_PATH) }?; + Ok(Self { lib }) + } +} + +macro_rules! dynamic_fns { + ($($abi:literal fn $fn_name:ident( $($arg_id:ident : $arg_type:ty),* ) -> $ret_type:ty;)*) => { + impl DynamicCuda { + $( + #[allow(dead_code)] + unsafe fn $fn_name(&self, $($arg_id : $arg_type),*) -> Result<$ret_type, libloading::Error> { + let func = unsafe { self.lib.get:: $ret_type>(concat!(stringify!($fn_name), "\0").as_bytes()) }; + func.map(|f| f($($arg_id),*) ) + } + )* + } + }; +} + +cuda_base::cuda_function_declarations!(dynamic_fns); + static COMGR: std::sync::LazyLock = std::sync::LazyLock::new(|| Comgr::new().unwrap()); +static CUDA: std::sync::LazyLock = + std::sync::LazyLock::new(|| DynamicCuda::new().unwrap()); fn run_hip + Copy + Debug, Output: From + Copy + Debug + Default>( name: &CStr, diff --git a/ptx_parser/src/ast.rs b/ptx_parser/src/ast.rs index e4c3c87..4e2502d 100644 --- a/ptx_parser/src/ast.rs +++ b/ptx_parser/src/ast.rs @@ -251,7 +251,10 @@ ptx_parser_macros::generate_instruction_type!( }, src1: T, src2: T, - src3: T, + src3: { + repr: T, + type: { Type::from(data.dst_type()) }, + } } }, Max { diff --git a/zluda/src/impl/memory.rs b/zluda/src/impl/memory.rs index 18e58e7..ff7a72b 100644 --- a/zluda/src/impl/memory.rs +++ b/zluda/src/impl/memory.rs @@ -42,3 +42,6 @@ pub(crate) fn set_d32_v2(dst: hipDeviceptr_t, ui: ::core::ffi::c_uint, n: usize) pub(crate) fn set_d8_v2(dst: hipDeviceptr_t, value: ::core::ffi::c_uchar, n: usize) -> hipError_t { unsafe { hipMemsetD8(dst, value, n) } } +pub(crate) fn get_info_v2(free: *mut usize, total: *mut usize) -> hipError_t { + unsafe { hipMemGetInfo(free, total) } +} diff --git a/zluda/src/impl/mod.rs b/zluda/src/impl/mod.rs index 4d8bc83..db51ffb 100644 --- a/zluda/src/impl/mod.rs +++ b/zluda/src/impl/mod.rs @@ -9,6 +9,7 @@ pub(super) mod function; pub(super) mod memory; pub(super) mod module; pub(super) mod pointer; +pub(super) mod stream; #[cfg(debug_assertions)] pub(crate) fn unimplemented() -> CUresult { diff --git a/zluda/src/impl/stream.rs b/zluda/src/impl/stream.rs new file mode 100644 index 0000000..2a07adc --- /dev/null +++ b/zluda/src/impl/stream.rs @@ -0,0 +1,5 @@ +use hip_runtime_sys::*; + +pub(crate) fn synchronize(stream: hipStream_t) -> hipError_t { + unsafe { hipStreamSynchronize(stream) } +} diff --git a/zluda/src/lib.rs b/zluda/src/lib.rs index e058bd7..9b94032 100644 --- a/zluda/src/lib.rs +++ b/zluda/src/lib.rs @@ -54,26 +54,28 @@ cuda_base::cuda_function_declarations!( cuDeviceGetCount, cuDeviceGetLuid, cuDeviceGetName, - cuDevicePrimaryCtxRelease, - cuDevicePrimaryCtxRetain, cuDeviceGetProperties, cuDeviceGetUuid, cuDeviceGetUuid_v2, + cuDevicePrimaryCtxRelease, + cuDevicePrimaryCtxRetain, cuDeviceTotalMem_v2, cuDriverGetVersion, cuFuncGetAttribute, cuInit, cuMemAlloc_v2, cuMemFree_v2, + cuMemGetAddressRange_v2, + cuMemGetInfo_v2, cuMemcpyDtoH_v2, cuMemcpyHtoD_v2, + cuMemsetD32_v2, + cuMemsetD8_v2, cuModuleGetFunction, cuModuleLoadData, cuModuleUnload, cuPointerGetAttribute, - cuMemGetAddressRange_v2, - cuMemsetD32_v2, - cuMemsetD8_v2 + cuStreamSynchronize ], implemented_in_function <= [ cuLaunchKernel,