Fix mad.wide, replace external CUDA library in test with our own (#376)

This commit is contained in:
Andrzej Janik
2025-06-09 21:33:18 -07:00
committed by GitHub
parent c790ab45ec
commit 3361046760
14 changed files with 244 additions and 114 deletions

22
Cargo.lock generated
View File

@ -270,24 +270,6 @@ dependencies = [
"unicode-segmentation",
]
[[package]]
name = "cuda-config"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ee74643f7430213a1a78320f88649de309b20b80818325575e393f848f79f5d"
dependencies = [
"glob",
]
[[package]]
name = "cuda-driver-sys"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1d4c552cc0de854877d80bcd1f11db75d42be32962d72a6799b88dcca88fffbd"
dependencies = [
"cuda-config",
]
[[package]]
name = "cuda_base"
version = "0.0.0"
@ -938,10 +920,12 @@ dependencies = [
"bit-vec 0.6.3",
"bitflags 1.3.2",
"comgr",
"cuda-driver-sys",
"cuda_base",
"cuda_types",
"half",
"hip_runtime-sys",
"int-enum",
"libloading",
"llvm_zluda",
"microlp",
"paste",

View File

@ -199,7 +199,7 @@ impl VisitMut for FixFnSignatures {
}
const MODULES: &[&str] = &[
"context", "device", "driver", "function", "link", "memory", "module", "pointer",
"context", "device", "driver", "function", "link", "memory", "module", "pointer", "stream",
];
#[proc_macro]

View File

@ -25,7 +25,9 @@ unwrap_or = "1.0.1"
[dev-dependencies]
hip_runtime-sys = { path = "../ext/hip_runtime-sys" }
comgr = { path = "../comgr" }
cuda_types = { path = "../cuda_types" }
cuda_base = { path = "../cuda_base" }
tempfile = "3"
paste = "1.0"
cuda-driver-sys = "0.3.0"
pretty_assertions = "1.4.1"
libloading = "0.8"

View File

@ -1,54 +1,44 @@
define amdgpu_kernel void @mad_s32(ptr addrspace(4) byref(i64) %"41", ptr addrspace(4) byref(i64) %"42") #0 {
%"43" = alloca i64, align 8, addrspace(5)
%"44" = alloca i64, align 8, addrspace(5)
%"45" = alloca i32, align 4, addrspace(5)
%"46" = alloca i32, align 4, addrspace(5)
%"47" = alloca i32, align 4, addrspace(5)
%"48" = alloca i32, align 4, addrspace(5)
define amdgpu_kernel void @mad_s32(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 {
%"39" = alloca i64, align 8, addrspace(5)
%"40" = alloca i64, align 8, addrspace(5)
%"41" = alloca i32, align 4, addrspace(5)
%"42" = alloca i32, align 4, addrspace(5)
%"43" = alloca i32, align 4, addrspace(5)
%"44" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"40"
br label %"36"
"40": ; preds = %1
%"49" = load i64, ptr addrspace(4) %"41", align 4
store i64 %"49", ptr addrspace(5) %"43", align 4
%"50" = load i64, ptr addrspace(4) %"42", align 4
store i64 %"50", ptr addrspace(5) %"44", align 4
%"52" = load i64, ptr addrspace(5) %"43", align 4
%"67" = inttoptr i64 %"52" to ptr
%"51" = load i32, ptr %"67", align 4
store i32 %"51", ptr addrspace(5) %"46", align 4
%"53" = load i64, ptr addrspace(5) %"43", align 4
%"68" = inttoptr i64 %"53" to ptr
%"33" = getelementptr inbounds i8, ptr %"68", i64 4
%"54" = load i32, ptr %"33", align 4
store i32 %"54", ptr addrspace(5) %"47", align 4
%"55" = load i64, ptr addrspace(5) %"43", align 4
%"69" = inttoptr i64 %"55" to ptr
%"35" = getelementptr inbounds i8, ptr %"69", i64 8
%"56" = load i32, ptr %"35", align 4
store i32 %"56", ptr addrspace(5) %"48", align 4
%"58" = load i32, ptr addrspace(5) %"46", align 4
%"59" = load i32, ptr addrspace(5) %"47", align 4
%"60" = load i32, ptr addrspace(5) %"48", align 4
%2 = mul i32 %"58", %"59"
%"57" = add i32 %2, %"60"
store i32 %"57", ptr addrspace(5) %"45", align 4
%"61" = load i64, ptr addrspace(5) %"44", align 4
%"62" = load i32, ptr addrspace(5) %"45", align 4
%"70" = inttoptr i64 %"61" to ptr
store i32 %"62", ptr %"70", align 4
%"63" = load i64, ptr addrspace(5) %"44", align 4
%"71" = inttoptr i64 %"63" to ptr
%"37" = getelementptr inbounds i8, ptr %"71", i64 4
%"64" = load i32, ptr addrspace(5) %"45", align 4
store i32 %"64", ptr %"37", align 4
%"65" = load i64, ptr addrspace(5) %"44", align 4
%"72" = inttoptr i64 %"65" to ptr
%"39" = getelementptr inbounds i8, ptr %"72", i64 8
%"66" = load i32, ptr addrspace(5) %"45", align 4
store i32 %"66", ptr %"39", align 4
"36": ; preds = %1
%"45" = load i64, ptr addrspace(4) %"37", align 4
store i64 %"45", ptr addrspace(5) %"39", align 4
%"46" = load i64, ptr addrspace(4) %"38", align 4
store i64 %"46", ptr addrspace(5) %"40", align 4
%"48" = load i64, ptr addrspace(5) %"39", align 4
%"59" = inttoptr i64 %"48" to ptr
%"47" = load i32, ptr %"59", align 4
store i32 %"47", ptr addrspace(5) %"42", align 4
%"49" = load i64, ptr addrspace(5) %"39", align 4
%"60" = inttoptr i64 %"49" to ptr
%"33" = getelementptr inbounds i8, ptr %"60", i64 4
%"50" = load i32, ptr %"33", align 4
store i32 %"50", ptr addrspace(5) %"43", align 4
%"51" = load i64, ptr addrspace(5) %"39", align 4
%"61" = inttoptr i64 %"51" to ptr
%"35" = getelementptr inbounds i8, ptr %"61", i64 8
%"52" = load i32, ptr %"35", align 4
store i32 %"52", ptr addrspace(5) %"44", align 4
%"54" = load i32, ptr addrspace(5) %"42", align 4
%"55" = load i32, ptr addrspace(5) %"43", align 4
%"56" = load i32, ptr addrspace(5) %"44", align 4
%2 = mul i32 %"54", %"55"
%"53" = add i32 %2, %"56"
store i32 %"53", ptr addrspace(5) %"41", align 4
%"57" = load i64, ptr addrspace(5) %"40", align 4
%"58" = load i32, ptr addrspace(5) %"41", align 4
%"62" = inttoptr i64 %"57" to ptr
store i32 %"58", ptr %"62", align 4
ret void
}

View File

@ -0,0 +1,47 @@
define amdgpu_kernel void @mad_wide(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 {
%"39" = alloca i64, align 8, addrspace(5)
%"40" = alloca i64, align 8, addrspace(5)
%"41" = alloca i64, align 8, addrspace(5)
%"42" = alloca i32, align 4, addrspace(5)
%"43" = alloca i32, align 4, addrspace(5)
%"44" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
br label %"36"
"36": ; preds = %1
%"45" = load i64, ptr addrspace(4) %"37", align 4
store i64 %"45", ptr addrspace(5) %"39", align 4
%"46" = load i64, ptr addrspace(4) %"38", align 4
store i64 %"46", ptr addrspace(5) %"40", align 4
%"48" = load i64, ptr addrspace(5) %"39", align 4
%"59" = inttoptr i64 %"48" to ptr
%"47" = load i32, ptr %"59", align 4
store i32 %"47", ptr addrspace(5) %"42", align 4
%"49" = load i64, ptr addrspace(5) %"39", align 4
%"60" = inttoptr i64 %"49" to ptr
%"33" = getelementptr inbounds i8, ptr %"60", i64 4
%"50" = load i32, ptr %"33", align 4
store i32 %"50", ptr addrspace(5) %"43", align 4
%"51" = load i64, ptr addrspace(5) %"39", align 4
%"61" = inttoptr i64 %"51" to ptr
%"35" = getelementptr inbounds i8, ptr %"61", i64 8
%"52" = load i64, ptr %"35", align 4
store i64 %"52", ptr addrspace(5) %"44", align 4
%"54" = load i32, ptr addrspace(5) %"42", align 4
%"55" = load i32, ptr addrspace(5) %"43", align 4
%"56" = load i64, ptr addrspace(5) %"44", align 4
%2 = sext i32 %"54" to i64
%3 = sext i32 %"55" to i64
%4 = mul i64 %2, %3
%"53" = add i64 %4, %"56"
store i64 %"53", ptr addrspace(5) %"41", align 4
%"57" = load i64, ptr addrspace(5) %"40", align 4
%"58" = load i64, ptr addrspace(5) %"41", align 4
%"62" = inttoptr i64 %"57" to ptr
store i64 %"58", ptr %"62", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

View File

@ -22,7 +22,5 @@
ld.s32 src3, [in_addr+8];
mad.lo.s32 dst, src1, src2, src3;
st.s32 [out_addr], dst;
st.s32 [out_addr+4], dst;
st.s32 [out_addr+8], dst;
ret;
}

View File

@ -0,0 +1,27 @@
.version 6.5
.target sm_30
.address_size 64
// Test kernel exercising `mad.wide.s32`.
// Reads two signed 32-bit values and one signed 64-bit value from the buffer
// whose address is passed in `input`, and writes a single signed 64-bit result
// to the buffer whose address is passed in `output`.
.visible .entry mad_wide(
.param .u64 input,
.param .u64 output
)
{
.reg .u64 in_addr;
.reg .u64 out_addr;
.reg .s64 dst;
.reg .s32 src1;
.reg .s32 src2;
.reg .s64 src3;
// Fetch the two buffer addresses from the kernel parameters.
ld.param.u64 in_addr, [input];
ld.param.u64 out_addr, [output];
// Input layout: s32 at offset 0, s32 at offset 4, s64 at offset 8.
ld.s32 src1, [in_addr];
ld.s32 src2, [in_addr+4];
ld.s64 src3, [in_addr+8];
// dst and src3 are 64-bit while src1 and src2 are 32-bit: the wide form
// produces a 64-bit product of the 32-bit sources and adds src3 to it.
mad.wide.s32 dst, src1, src2, src3;
st.s64 [out_addr], dst;
ret;
}

View File

@ -1,5 +1,6 @@
use crate::pass;
use comgr::Comgr;
use cuda_types::cuda::CUstream;
use hip_runtime_sys::hipError_t;
use pretty_assertions;
use std::env;
@ -95,7 +96,8 @@ test_ptx!(b64tof64, [111u64], [111u64]);
// This segfaults NV compiler
// test_ptx!(implicit_param, [34u32], [34u32]);
test_ptx!(pred_not, [10u64, 11u64], [2u64, 0u64]);
test_ptx!(mad_s32, [2i32, 3i32, 4i32], [10i32, 10i32, 10i32]);
test_ptx!(mad_s32, [2i32, 3i32, 4i32], [10i32]);
test_ptx!(mad_wide, [-1i32, 3, 4, 5], [21474836481i64]);
test_ptx!(
mul_wide,
[0x01_00_00_00__01_00_00_00i64],
@ -313,60 +315,126 @@ fn test_cuda_assert<
output: &mut [Output],
) -> Result<(), Box<dyn error::Error + 'a>> {
let name = CString::new(name)?;
let result =
run_cuda(name.as_c_str(), ptx_text, input, output).map_err(|err| DisplayError { err })?;
let result = run_cuda(name.as_c_str(), ptx_text, input, output);
assert_eq!(result.as_slice(), output);
Ok(())
}
macro_rules! cuda_call {
($expr:expr) => {
#[allow(unused_unsafe)]
{
let err = unsafe { $expr };
if err != cuda_driver_sys::CUresult::CUDA_SUCCESS {
return Result::Err(err);
}
}
};
}
fn run_cuda<Input: From<u8> + Copy + Debug, Output: From<u8> + Copy + Debug + Default>(
name: &CStr,
ptx_module: &str,
input: &[Input],
output: &mut [Output],
) -> Result<Vec<Output>, cuda_driver_sys::CUresult> {
use cuda_driver_sys::*;
cuda_call! { cuInit(0) };
) -> Vec<Output> {
unsafe { CUDA.cuInit(0) }.unwrap().unwrap();
let ptx_module = CString::new(ptx_module).unwrap();
let mut result = vec![0u8.into(); output.len()];
{
let mut ctx = ptr::null_mut();
cuda_call! { cuCtxCreate_v2(&mut ctx, 0, 0) };
let mut module = ptr::null_mut();
cuda_call! { cuModuleLoadData(&mut module, ptx_module.as_ptr() as _) };
let mut kernel = ptr::null_mut();
cuda_call! { cuModuleGetFunction(&mut kernel, module, name.as_ptr()) };
let mut ctx = unsafe { mem::zeroed() };
unsafe { CUDA.cuCtxCreate_v2(&mut ctx, 0, 0) }
.unwrap()
.unwrap();
let mut module = unsafe { mem::zeroed() };
unsafe { CUDA.cuModuleLoadData(&mut module, ptx_module.as_ptr() as _) }
.unwrap()
.unwrap();
let mut kernel = unsafe { mem::zeroed() };
unsafe { CUDA.cuModuleGetFunction(&mut kernel, module, name.as_ptr()) }
.unwrap()
.unwrap();
let mut inp_b = unsafe { mem::zeroed() };
cuda_call! { cuMemAlloc_v2(&mut inp_b, input.len() * mem::size_of::<Input>()) };
unsafe { CUDA.cuMemAlloc_v2(&mut inp_b, input.len() * mem::size_of::<Input>()) }
.unwrap()
.unwrap();
let mut out_b = unsafe { mem::zeroed() };
cuda_call! { cuMemAlloc_v2(&mut out_b, output.len() * mem::size_of::<Output>()) };
cuda_call! { cuMemcpyHtoD_v2(inp_b, input.as_ptr() as _, input.len() * mem::size_of::<Input>()) };
cuda_call! { cuMemsetD8_v2(out_b, 0, output.len() * mem::size_of::<Output>()) };
let mut args = [&inp_b, &out_b];
cuda_call! { cuLaunchKernel(kernel, 1,1,1,1,1,1, 1024, 0 as _, args.as_mut_ptr() as _, ptr::null_mut()) };
cuda_call! { cuMemcpyDtoH_v2(result.as_mut_ptr() as _, out_b, output.len() * mem::size_of::<Output>()) };
cuda_call! { cuStreamSynchronize(0 as _) };
cuda_call! { cuMemFree_v2(inp_b) };
cuda_call! { cuMemFree_v2(out_b) };
cuda_call! { cuModuleUnload(module) };
cuda_call! { cuCtxDestroy_v2(ctx) };
unsafe { CUDA.cuMemAlloc_v2(&mut out_b, output.len() * mem::size_of::<Output>()) }
.unwrap()
.unwrap();
unsafe {
CUDA.cuMemcpyHtoD_v2(
inp_b,
input.as_ptr() as _,
input.len() * mem::size_of::<Input>(),
)
}
Ok(result)
.unwrap()
.unwrap();
unsafe { CUDA.cuMemsetD8_v2(out_b, 0, output.len() * mem::size_of::<Output>()) }
.unwrap()
.unwrap();
let mut args = [&inp_b, &out_b];
unsafe {
CUDA.cuLaunchKernel(
kernel,
1,
1,
1,
1,
1,
1,
1024,
CUstream(ptr::null_mut()),
args.as_mut_ptr() as _,
ptr::null_mut(),
)
}
.unwrap()
.unwrap();
unsafe {
CUDA.cuMemcpyDtoH_v2(
result.as_mut_ptr() as _,
out_b,
output.len() * mem::size_of::<Output>(),
)
}
.unwrap()
.unwrap();
unsafe { CUDA.cuStreamSynchronize(CUstream(ptr::null_mut())) }
.unwrap()
.unwrap();
unsafe { CUDA.cuMemFree_v2(inp_b) }.unwrap().unwrap();
unsafe { CUDA.cuMemFree_v2(out_b) }.unwrap().unwrap();
unsafe { CUDA.cuModuleUnload(module) }.unwrap().unwrap();
unsafe { CUDA.cuCtxDestroy_v2(ctx) }.unwrap().unwrap();
}
result
}
/// Wrapper around a CUDA driver library that is opened at run time with
/// `libloading` instead of being linked at build time.
struct DynamicCuda {
    /// Handle to the loaded library; driver symbols are resolved through it.
    lib: libloading::Library,
}

impl DynamicCuda {
    /// Absolute path of the driver library on Windows.
    #[cfg(windows)]
    const CUDA_PATH: &'static str = "C:\\Windows\\System32\\nvcuda.dll";
    /// Absolute path of the driver library on every non-Windows target.
    #[cfg(not(windows))]
    const CUDA_PATH: &'static str = "/usr/lib/x86_64-linux-gnu/libcuda.so.1";

    /// Opens the library found at [`Self::CUDA_PATH`].
    ///
    /// # Errors
    /// Returns the `libloading::Error` produced when the library cannot be
    /// loaded from that path.
    pub fn new() -> Result<Self, libloading::Error> {
        // SAFETY: loading a shared library runs its initialisation routines.
        // NOTE(review): this assumes the file at `CUDA_PATH` is the vendor
        // driver and is safe to load — confirm for the target machines.
        let loaded = unsafe { libloading::Library::new(Self::CUDA_PATH) };
        loaded.map(|lib| Self { lib })
    }
}
/// Generates one `unsafe` method on `DynamicCuda` per function declaration.
///
/// Input is a list of `"<abi>" fn name(arg: Type, ...) -> Ret;` items. For each
/// one the macro emits `unsafe fn name(&self, arg: Type, ...)` that looks up a
/// symbol with the same name in `self.lib` and calls it with the given
/// arguments. The method returns `Err(libloading::Error)` when the symbol
/// lookup fails and `Ok(<return value of the call>)` otherwise.
///
/// NOTE(review): `$abi` is captured but never used; the function pointer type
/// is always `extern "system"`. Presumably every declaration passed in uses
/// that ABI — confirm against the macro that supplies the declarations.
macro_rules! dynamic_fns {
($($abi:literal fn $fn_name:ident( $($arg_id:ident : $arg_type:ty),* ) -> $ret_type:ty;)*) => {
impl DynamicCuda {
$(
// Not every generated method is called, hence the `dead_code` allowance.
#[allow(dead_code)]
unsafe fn $fn_name(&self, $($arg_id : $arg_type),*) -> Result<$ret_type, libloading::Error> {
// The symbol is resolved on every call; its name is the method name with a
// trailing NUL byte appended.
let func = unsafe { self.lib.get::<unsafe extern "system" fn ($($arg_type),*) -> $ret_type>(concat!(stringify!($fn_name), "\0").as_bytes()) };
// Invoke the resolved function only when the lookup succeeded.
func.map(|f| f($($arg_id),*) )
}
)*
}
};
}
// Hands `dynamic_fns` to `cuda_function_declarations!`; presumably this expands
// `dynamic_fns!` over the CUDA driver function declarations, producing the
// `DynamicCuda` methods — verify against `cuda_base`.
cuda_base::cuda_function_declarations!(dynamic_fns);
// Shared `Comgr` instance, created on first access; panics if `Comgr::new()` fails.
static COMGR: std::sync::LazyLock<Comgr> = std::sync::LazyLock::new(|| Comgr::new().unwrap());
// Shared `DynamicCuda` instance, created on first access; panics if the
// library cannot be loaded by `DynamicCuda::new()`.
static CUDA: std::sync::LazyLock<DynamicCuda> =
std::sync::LazyLock::new(|| DynamicCuda::new().unwrap());
fn run_hip<Input: From<u8> + Copy + Debug, Output: From<u8> + Copy + Debug + Default>(
name: &CStr,

View File

@ -251,7 +251,10 @@ ptx_parser_macros::generate_instruction_type!(
},
src1: T,
src2: T,
src3: T,
src3: {
repr: T,
type: { Type::from(data.dst_type()) },
}
}
},
Max {

View File

@ -42,3 +42,6 @@ pub(crate) fn set_d32_v2(dst: hipDeviceptr_t, ui: ::core::ffi::c_uint, n: usize)
/// Thin wrapper that forwards `dst`, `value` and `n` unchanged to
/// `hipMemsetD8` and returns the `hipError_t` it reports.
pub(crate) fn set_d8_v2(dst: hipDeviceptr_t, value: ::core::ffi::c_uchar, n: usize) -> hipError_t {
// NOTE(review): arguments are passed through without validation; presumably
// the caller guarantees `dst` refers to at least `n` bytes — confirm.
unsafe { hipMemsetD8(dst, value, n) }
}
/// Thin wrapper that forwards the `free` and `total` out-pointers unchanged to
/// `hipMemGetInfo` and returns the `hipError_t` it reports.
pub(crate) fn get_info_v2(free: *mut usize, total: *mut usize) -> hipError_t {
// NOTE(review): the raw pointers are not checked here; presumably the caller
// guarantees both are valid for writes — confirm.
unsafe { hipMemGetInfo(free, total) }
}

View File

@ -9,6 +9,7 @@ pub(super) mod function;
pub(super) mod memory;
pub(super) mod module;
pub(super) mod pointer;
pub(super) mod stream;
#[cfg(debug_assertions)]
pub(crate) fn unimplemented() -> CUresult {

5
zluda/src/impl/stream.rs Normal file
View File

@ -0,0 +1,5 @@
use hip_runtime_sys::*;
/// Thin wrapper that forwards `stream` unchanged to `hipStreamSynchronize`
/// and returns the `hipError_t` it reports.
pub(crate) fn synchronize(stream: hipStream_t) -> hipError_t {
// NOTE(review): `stream` is passed through without validation; presumably a
// null handle selects the default stream as in the HIP API — confirm.
unsafe { hipStreamSynchronize(stream) }
}

View File

@ -54,26 +54,28 @@ cuda_base::cuda_function_declarations!(
cuDeviceGetCount,
cuDeviceGetLuid,
cuDeviceGetName,
cuDevicePrimaryCtxRelease,
cuDevicePrimaryCtxRetain,
cuDeviceGetProperties,
cuDeviceGetUuid,
cuDeviceGetUuid_v2,
cuDevicePrimaryCtxRelease,
cuDevicePrimaryCtxRetain,
cuDeviceTotalMem_v2,
cuDriverGetVersion,
cuFuncGetAttribute,
cuInit,
cuMemAlloc_v2,
cuMemFree_v2,
cuMemGetAddressRange_v2,
cuMemGetInfo_v2,
cuMemcpyDtoH_v2,
cuMemcpyHtoD_v2,
cuMemsetD32_v2,
cuMemsetD8_v2,
cuModuleGetFunction,
cuModuleLoadData,
cuModuleUnload,
cuPointerGetAttribute,
cuMemGetAddressRange_v2,
cuMemsetD32_v2,
cuMemsetD8_v2
cuStreamSynchronize
],
implemented_in_function <= [
cuLaunchKernel,