mirror of
https://github.com/vosen/ZLUDA.git
synced 2025-08-02 14:57:43 +03:00
Fix mad.wide, replace external CUDA library in test with our own (#376)
This commit is contained in:
22
Cargo.lock
generated
22
Cargo.lock
generated
@ -270,24 +270,6 @@ dependencies = [
|
|||||||
"unicode-segmentation",
|
"unicode-segmentation",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "cuda-config"
|
|
||||||
version = "0.1.0"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "4ee74643f7430213a1a78320f88649de309b20b80818325575e393f848f79f5d"
|
|
||||||
dependencies = [
|
|
||||||
"glob",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "cuda-driver-sys"
|
|
||||||
version = "0.3.0"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "1d4c552cc0de854877d80bcd1f11db75d42be32962d72a6799b88dcca88fffbd"
|
|
||||||
dependencies = [
|
|
||||||
"cuda-config",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "cuda_base"
|
name = "cuda_base"
|
||||||
version = "0.0.0"
|
version = "0.0.0"
|
||||||
@ -938,10 +920,12 @@ dependencies = [
|
|||||||
"bit-vec 0.6.3",
|
"bit-vec 0.6.3",
|
||||||
"bitflags 1.3.2",
|
"bitflags 1.3.2",
|
||||||
"comgr",
|
"comgr",
|
||||||
"cuda-driver-sys",
|
"cuda_base",
|
||||||
|
"cuda_types",
|
||||||
"half",
|
"half",
|
||||||
"hip_runtime-sys",
|
"hip_runtime-sys",
|
||||||
"int-enum",
|
"int-enum",
|
||||||
|
"libloading",
|
||||||
"llvm_zluda",
|
"llvm_zluda",
|
||||||
"microlp",
|
"microlp",
|
||||||
"paste",
|
"paste",
|
||||||
|
@ -199,7 +199,7 @@ impl VisitMut for FixFnSignatures {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const MODULES: &[&str] = &[
|
const MODULES: &[&str] = &[
|
||||||
"context", "device", "driver", "function", "link", "memory", "module", "pointer",
|
"context", "device", "driver", "function", "link", "memory", "module", "pointer", "stream",
|
||||||
];
|
];
|
||||||
|
|
||||||
#[proc_macro]
|
#[proc_macro]
|
||||||
|
@ -25,7 +25,9 @@ unwrap_or = "1.0.1"
|
|||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
hip_runtime-sys = { path = "../ext/hip_runtime-sys" }
|
hip_runtime-sys = { path = "../ext/hip_runtime-sys" }
|
||||||
comgr = { path = "../comgr" }
|
comgr = { path = "../comgr" }
|
||||||
|
cuda_types = { path = "../cuda_types" }
|
||||||
|
cuda_base = { path = "../cuda_base" }
|
||||||
tempfile = "3"
|
tempfile = "3"
|
||||||
paste = "1.0"
|
paste = "1.0"
|
||||||
cuda-driver-sys = "0.3.0"
|
|
||||||
pretty_assertions = "1.4.1"
|
pretty_assertions = "1.4.1"
|
||||||
|
libloading = "0.8"
|
||||||
|
@ -1,54 +1,44 @@
|
|||||||
define amdgpu_kernel void @mad_s32(ptr addrspace(4) byref(i64) %"41", ptr addrspace(4) byref(i64) %"42") #0 {
|
define amdgpu_kernel void @mad_s32(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 {
|
||||||
%"43" = alloca i64, align 8, addrspace(5)
|
%"39" = alloca i64, align 8, addrspace(5)
|
||||||
%"44" = alloca i64, align 8, addrspace(5)
|
%"40" = alloca i64, align 8, addrspace(5)
|
||||||
%"45" = alloca i32, align 4, addrspace(5)
|
%"41" = alloca i32, align 4, addrspace(5)
|
||||||
%"46" = alloca i32, align 4, addrspace(5)
|
%"42" = alloca i32, align 4, addrspace(5)
|
||||||
%"47" = alloca i32, align 4, addrspace(5)
|
%"43" = alloca i32, align 4, addrspace(5)
|
||||||
%"48" = alloca i32, align 4, addrspace(5)
|
%"44" = alloca i32, align 4, addrspace(5)
|
||||||
br label %1
|
br label %1
|
||||||
|
|
||||||
1: ; preds = %0
|
1: ; preds = %0
|
||||||
br label %"40"
|
br label %"36"
|
||||||
|
|
||||||
"40": ; preds = %1
|
"36": ; preds = %1
|
||||||
%"49" = load i64, ptr addrspace(4) %"41", align 4
|
%"45" = load i64, ptr addrspace(4) %"37", align 4
|
||||||
store i64 %"49", ptr addrspace(5) %"43", align 4
|
store i64 %"45", ptr addrspace(5) %"39", align 4
|
||||||
%"50" = load i64, ptr addrspace(4) %"42", align 4
|
%"46" = load i64, ptr addrspace(4) %"38", align 4
|
||||||
store i64 %"50", ptr addrspace(5) %"44", align 4
|
store i64 %"46", ptr addrspace(5) %"40", align 4
|
||||||
%"52" = load i64, ptr addrspace(5) %"43", align 4
|
%"48" = load i64, ptr addrspace(5) %"39", align 4
|
||||||
%"67" = inttoptr i64 %"52" to ptr
|
%"59" = inttoptr i64 %"48" to ptr
|
||||||
%"51" = load i32, ptr %"67", align 4
|
%"47" = load i32, ptr %"59", align 4
|
||||||
store i32 %"51", ptr addrspace(5) %"46", align 4
|
store i32 %"47", ptr addrspace(5) %"42", align 4
|
||||||
%"53" = load i64, ptr addrspace(5) %"43", align 4
|
%"49" = load i64, ptr addrspace(5) %"39", align 4
|
||||||
%"68" = inttoptr i64 %"53" to ptr
|
%"60" = inttoptr i64 %"49" to ptr
|
||||||
%"33" = getelementptr inbounds i8, ptr %"68", i64 4
|
%"33" = getelementptr inbounds i8, ptr %"60", i64 4
|
||||||
%"54" = load i32, ptr %"33", align 4
|
%"50" = load i32, ptr %"33", align 4
|
||||||
store i32 %"54", ptr addrspace(5) %"47", align 4
|
store i32 %"50", ptr addrspace(5) %"43", align 4
|
||||||
%"55" = load i64, ptr addrspace(5) %"43", align 4
|
%"51" = load i64, ptr addrspace(5) %"39", align 4
|
||||||
%"69" = inttoptr i64 %"55" to ptr
|
%"61" = inttoptr i64 %"51" to ptr
|
||||||
%"35" = getelementptr inbounds i8, ptr %"69", i64 8
|
%"35" = getelementptr inbounds i8, ptr %"61", i64 8
|
||||||
%"56" = load i32, ptr %"35", align 4
|
%"52" = load i32, ptr %"35", align 4
|
||||||
store i32 %"56", ptr addrspace(5) %"48", align 4
|
store i32 %"52", ptr addrspace(5) %"44", align 4
|
||||||
%"58" = load i32, ptr addrspace(5) %"46", align 4
|
%"54" = load i32, ptr addrspace(5) %"42", align 4
|
||||||
%"59" = load i32, ptr addrspace(5) %"47", align 4
|
%"55" = load i32, ptr addrspace(5) %"43", align 4
|
||||||
%"60" = load i32, ptr addrspace(5) %"48", align 4
|
%"56" = load i32, ptr addrspace(5) %"44", align 4
|
||||||
%2 = mul i32 %"58", %"59"
|
%2 = mul i32 %"54", %"55"
|
||||||
%"57" = add i32 %2, %"60"
|
%"53" = add i32 %2, %"56"
|
||||||
store i32 %"57", ptr addrspace(5) %"45", align 4
|
store i32 %"53", ptr addrspace(5) %"41", align 4
|
||||||
%"61" = load i64, ptr addrspace(5) %"44", align 4
|
%"57" = load i64, ptr addrspace(5) %"40", align 4
|
||||||
%"62" = load i32, ptr addrspace(5) %"45", align 4
|
%"58" = load i32, ptr addrspace(5) %"41", align 4
|
||||||
%"70" = inttoptr i64 %"61" to ptr
|
%"62" = inttoptr i64 %"57" to ptr
|
||||||
store i32 %"62", ptr %"70", align 4
|
store i32 %"58", ptr %"62", align 4
|
||||||
%"63" = load i64, ptr addrspace(5) %"44", align 4
|
|
||||||
%"71" = inttoptr i64 %"63" to ptr
|
|
||||||
%"37" = getelementptr inbounds i8, ptr %"71", i64 4
|
|
||||||
%"64" = load i32, ptr addrspace(5) %"45", align 4
|
|
||||||
store i32 %"64", ptr %"37", align 4
|
|
||||||
%"65" = load i64, ptr addrspace(5) %"44", align 4
|
|
||||||
%"72" = inttoptr i64 %"65" to ptr
|
|
||||||
%"39" = getelementptr inbounds i8, ptr %"72", i64 8
|
|
||||||
%"66" = load i32, ptr addrspace(5) %"45", align 4
|
|
||||||
store i32 %"66", ptr %"39", align 4
|
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
|
47
ptx/src/test/ll/mad_wide.ll
Normal file
47
ptx/src/test/ll/mad_wide.ll
Normal file
@ -0,0 +1,47 @@
|
|||||||
|
define amdgpu_kernel void @mad_wide(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 {
|
||||||
|
%"39" = alloca i64, align 8, addrspace(5)
|
||||||
|
%"40" = alloca i64, align 8, addrspace(5)
|
||||||
|
%"41" = alloca i64, align 8, addrspace(5)
|
||||||
|
%"42" = alloca i32, align 4, addrspace(5)
|
||||||
|
%"43" = alloca i32, align 4, addrspace(5)
|
||||||
|
%"44" = alloca i64, align 8, addrspace(5)
|
||||||
|
br label %1
|
||||||
|
|
||||||
|
1: ; preds = %0
|
||||||
|
br label %"36"
|
||||||
|
|
||||||
|
"36": ; preds = %1
|
||||||
|
%"45" = load i64, ptr addrspace(4) %"37", align 4
|
||||||
|
store i64 %"45", ptr addrspace(5) %"39", align 4
|
||||||
|
%"46" = load i64, ptr addrspace(4) %"38", align 4
|
||||||
|
store i64 %"46", ptr addrspace(5) %"40", align 4
|
||||||
|
%"48" = load i64, ptr addrspace(5) %"39", align 4
|
||||||
|
%"59" = inttoptr i64 %"48" to ptr
|
||||||
|
%"47" = load i32, ptr %"59", align 4
|
||||||
|
store i32 %"47", ptr addrspace(5) %"42", align 4
|
||||||
|
%"49" = load i64, ptr addrspace(5) %"39", align 4
|
||||||
|
%"60" = inttoptr i64 %"49" to ptr
|
||||||
|
%"33" = getelementptr inbounds i8, ptr %"60", i64 4
|
||||||
|
%"50" = load i32, ptr %"33", align 4
|
||||||
|
store i32 %"50", ptr addrspace(5) %"43", align 4
|
||||||
|
%"51" = load i64, ptr addrspace(5) %"39", align 4
|
||||||
|
%"61" = inttoptr i64 %"51" to ptr
|
||||||
|
%"35" = getelementptr inbounds i8, ptr %"61", i64 8
|
||||||
|
%"52" = load i64, ptr %"35", align 4
|
||||||
|
store i64 %"52", ptr addrspace(5) %"44", align 4
|
||||||
|
%"54" = load i32, ptr addrspace(5) %"42", align 4
|
||||||
|
%"55" = load i32, ptr addrspace(5) %"43", align 4
|
||||||
|
%"56" = load i64, ptr addrspace(5) %"44", align 4
|
||||||
|
%2 = sext i32 %"54" to i64
|
||||||
|
%3 = sext i32 %"55" to i64
|
||||||
|
%4 = mul i64 %2, %3
|
||||||
|
%"53" = add i64 %4, %"56"
|
||||||
|
store i64 %"53", ptr addrspace(5) %"41", align 4
|
||||||
|
%"57" = load i64, ptr addrspace(5) %"40", align 4
|
||||||
|
%"58" = load i64, ptr addrspace(5) %"41", align 4
|
||||||
|
%"62" = inttoptr i64 %"57" to ptr
|
||||||
|
store i64 %"58", ptr %"62", align 4
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -22,7 +22,5 @@
|
|||||||
ld.s32 src3, [in_addr+8];
|
ld.s32 src3, [in_addr+8];
|
||||||
mad.lo.s32 dst, src1, src2, src3;
|
mad.lo.s32 dst, src1, src2, src3;
|
||||||
st.s32 [out_addr], dst;
|
st.s32 [out_addr], dst;
|
||||||
st.s32 [out_addr+4], dst;
|
|
||||||
st.s32 [out_addr+8], dst;
|
|
||||||
ret;
|
ret;
|
||||||
}
|
}
|
||||||
|
27
ptx/src/test/spirv_run/mad_wide.ptx
Normal file
27
ptx/src/test/spirv_run/mad_wide.ptx
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
.version 6.5
|
||||||
|
.target sm_30
|
||||||
|
.address_size 64
|
||||||
|
|
||||||
|
.visible .entry mad_wide(
|
||||||
|
.param .u64 input,
|
||||||
|
.param .u64 output
|
||||||
|
)
|
||||||
|
{
|
||||||
|
.reg .u64 in_addr;
|
||||||
|
.reg .u64 out_addr;
|
||||||
|
|
||||||
|
.reg .s64 dst;
|
||||||
|
.reg .s32 src1;
|
||||||
|
.reg .s32 src2;
|
||||||
|
.reg .s64 src3;
|
||||||
|
|
||||||
|
ld.param.u64 in_addr, [input];
|
||||||
|
ld.param.u64 out_addr, [output];
|
||||||
|
|
||||||
|
ld.s32 src1, [in_addr];
|
||||||
|
ld.s32 src2, [in_addr+4];
|
||||||
|
ld.s64 src3, [in_addr+8];
|
||||||
|
mad.wide.s32 dst, src1, src2, src3;
|
||||||
|
st.s64 [out_addr], dst;
|
||||||
|
ret;
|
||||||
|
}
|
@ -1,5 +1,6 @@
|
|||||||
use crate::pass;
|
use crate::pass;
|
||||||
use comgr::Comgr;
|
use comgr::Comgr;
|
||||||
|
use cuda_types::cuda::CUstream;
|
||||||
use hip_runtime_sys::hipError_t;
|
use hip_runtime_sys::hipError_t;
|
||||||
use pretty_assertions;
|
use pretty_assertions;
|
||||||
use std::env;
|
use std::env;
|
||||||
@ -95,7 +96,8 @@ test_ptx!(b64tof64, [111u64], [111u64]);
|
|||||||
// This segfaults NV compiler
|
// This segfaults NV compiler
|
||||||
// test_ptx!(implicit_param, [34u32], [34u32]);
|
// test_ptx!(implicit_param, [34u32], [34u32]);
|
||||||
test_ptx!(pred_not, [10u64, 11u64], [2u64, 0u64]);
|
test_ptx!(pred_not, [10u64, 11u64], [2u64, 0u64]);
|
||||||
test_ptx!(mad_s32, [2i32, 3i32, 4i32], [10i32, 10i32, 10i32]);
|
test_ptx!(mad_s32, [2i32, 3i32, 4i32], [10i32]);
|
||||||
|
test_ptx!(mad_wide, [-1i32, 3, 4, 5], [21474836481i64]);
|
||||||
test_ptx!(
|
test_ptx!(
|
||||||
mul_wide,
|
mul_wide,
|
||||||
[0x01_00_00_00__01_00_00_00i64],
|
[0x01_00_00_00__01_00_00_00i64],
|
||||||
@ -313,60 +315,126 @@ fn test_cuda_assert<
|
|||||||
output: &mut [Output],
|
output: &mut [Output],
|
||||||
) -> Result<(), Box<dyn error::Error + 'a>> {
|
) -> Result<(), Box<dyn error::Error + 'a>> {
|
||||||
let name = CString::new(name)?;
|
let name = CString::new(name)?;
|
||||||
let result =
|
let result = run_cuda(name.as_c_str(), ptx_text, input, output);
|
||||||
run_cuda(name.as_c_str(), ptx_text, input, output).map_err(|err| DisplayError { err })?;
|
|
||||||
assert_eq!(result.as_slice(), output);
|
assert_eq!(result.as_slice(), output);
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
macro_rules! cuda_call {
|
|
||||||
($expr:expr) => {
|
|
||||||
#[allow(unused_unsafe)]
|
|
||||||
{
|
|
||||||
let err = unsafe { $expr };
|
|
||||||
if err != cuda_driver_sys::CUresult::CUDA_SUCCESS {
|
|
||||||
return Result::Err(err);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
fn run_cuda<Input: From<u8> + Copy + Debug, Output: From<u8> + Copy + Debug + Default>(
|
fn run_cuda<Input: From<u8> + Copy + Debug, Output: From<u8> + Copy + Debug + Default>(
|
||||||
name: &CStr,
|
name: &CStr,
|
||||||
ptx_module: &str,
|
ptx_module: &str,
|
||||||
input: &[Input],
|
input: &[Input],
|
||||||
output: &mut [Output],
|
output: &mut [Output],
|
||||||
) -> Result<Vec<Output>, cuda_driver_sys::CUresult> {
|
) -> Vec<Output> {
|
||||||
use cuda_driver_sys::*;
|
unsafe { CUDA.cuInit(0) }.unwrap().unwrap();
|
||||||
cuda_call! { cuInit(0) };
|
|
||||||
let ptx_module = CString::new(ptx_module).unwrap();
|
let ptx_module = CString::new(ptx_module).unwrap();
|
||||||
let mut result = vec![0u8.into(); output.len()];
|
let mut result = vec![0u8.into(); output.len()];
|
||||||
{
|
{
|
||||||
let mut ctx = ptr::null_mut();
|
let mut ctx = unsafe { mem::zeroed() };
|
||||||
cuda_call! { cuCtxCreate_v2(&mut ctx, 0, 0) };
|
unsafe { CUDA.cuCtxCreate_v2(&mut ctx, 0, 0) }
|
||||||
let mut module = ptr::null_mut();
|
.unwrap()
|
||||||
cuda_call! { cuModuleLoadData(&mut module, ptx_module.as_ptr() as _) };
|
.unwrap();
|
||||||
let mut kernel = ptr::null_mut();
|
let mut module = unsafe { mem::zeroed() };
|
||||||
cuda_call! { cuModuleGetFunction(&mut kernel, module, name.as_ptr()) };
|
unsafe { CUDA.cuModuleLoadData(&mut module, ptx_module.as_ptr() as _) }
|
||||||
|
.unwrap()
|
||||||
|
.unwrap();
|
||||||
|
let mut kernel = unsafe { mem::zeroed() };
|
||||||
|
unsafe { CUDA.cuModuleGetFunction(&mut kernel, module, name.as_ptr()) }
|
||||||
|
.unwrap()
|
||||||
|
.unwrap();
|
||||||
let mut inp_b = unsafe { mem::zeroed() };
|
let mut inp_b = unsafe { mem::zeroed() };
|
||||||
cuda_call! { cuMemAlloc_v2(&mut inp_b, input.len() * mem::size_of::<Input>()) };
|
unsafe { CUDA.cuMemAlloc_v2(&mut inp_b, input.len() * mem::size_of::<Input>()) }
|
||||||
|
.unwrap()
|
||||||
|
.unwrap();
|
||||||
let mut out_b = unsafe { mem::zeroed() };
|
let mut out_b = unsafe { mem::zeroed() };
|
||||||
cuda_call! { cuMemAlloc_v2(&mut out_b, output.len() * mem::size_of::<Output>()) };
|
unsafe { CUDA.cuMemAlloc_v2(&mut out_b, output.len() * mem::size_of::<Output>()) }
|
||||||
cuda_call! { cuMemcpyHtoD_v2(inp_b, input.as_ptr() as _, input.len() * mem::size_of::<Input>()) };
|
.unwrap()
|
||||||
cuda_call! { cuMemsetD8_v2(out_b, 0, output.len() * mem::size_of::<Output>()) };
|
.unwrap();
|
||||||
|
unsafe {
|
||||||
|
CUDA.cuMemcpyHtoD_v2(
|
||||||
|
inp_b,
|
||||||
|
input.as_ptr() as _,
|
||||||
|
input.len() * mem::size_of::<Input>(),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
.unwrap()
|
||||||
|
.unwrap();
|
||||||
|
unsafe { CUDA.cuMemsetD8_v2(out_b, 0, output.len() * mem::size_of::<Output>()) }
|
||||||
|
.unwrap()
|
||||||
|
.unwrap();
|
||||||
let mut args = [&inp_b, &out_b];
|
let mut args = [&inp_b, &out_b];
|
||||||
cuda_call! { cuLaunchKernel(kernel, 1,1,1,1,1,1, 1024, 0 as _, args.as_mut_ptr() as _, ptr::null_mut()) };
|
unsafe {
|
||||||
cuda_call! { cuMemcpyDtoH_v2(result.as_mut_ptr() as _, out_b, output.len() * mem::size_of::<Output>()) };
|
CUDA.cuLaunchKernel(
|
||||||
cuda_call! { cuStreamSynchronize(0 as _) };
|
kernel,
|
||||||
cuda_call! { cuMemFree_v2(inp_b) };
|
1,
|
||||||
cuda_call! { cuMemFree_v2(out_b) };
|
1,
|
||||||
cuda_call! { cuModuleUnload(module) };
|
1,
|
||||||
cuda_call! { cuCtxDestroy_v2(ctx) };
|
1,
|
||||||
|
1,
|
||||||
|
1,
|
||||||
|
1024,
|
||||||
|
CUstream(ptr::null_mut()),
|
||||||
|
args.as_mut_ptr() as _,
|
||||||
|
ptr::null_mut(),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
.unwrap()
|
||||||
|
.unwrap();
|
||||||
|
unsafe {
|
||||||
|
CUDA.cuMemcpyDtoH_v2(
|
||||||
|
result.as_mut_ptr() as _,
|
||||||
|
out_b,
|
||||||
|
output.len() * mem::size_of::<Output>(),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
.unwrap()
|
||||||
|
.unwrap();
|
||||||
|
unsafe { CUDA.cuStreamSynchronize(CUstream(ptr::null_mut())) }
|
||||||
|
.unwrap()
|
||||||
|
.unwrap();
|
||||||
|
unsafe { CUDA.cuMemFree_v2(inp_b) }.unwrap().unwrap();
|
||||||
|
unsafe { CUDA.cuMemFree_v2(out_b) }.unwrap().unwrap();
|
||||||
|
unsafe { CUDA.cuModuleUnload(module) }.unwrap().unwrap();
|
||||||
|
unsafe { CUDA.cuCtxDestroy_v2(ctx) }.unwrap().unwrap();
|
||||||
}
|
}
|
||||||
Ok(result)
|
result
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct DynamicCuda {
|
||||||
|
lib: libloading::Library,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl DynamicCuda {
|
||||||
|
#[cfg(not(windows))]
|
||||||
|
const CUDA_PATH: &'static str = "/usr/lib/x86_64-linux-gnu/libcuda.so.1";
|
||||||
|
#[cfg(windows)]
|
||||||
|
const CUDA_PATH: &'static str = "C:\\Windows\\System32\\nvcuda.dll";
|
||||||
|
|
||||||
|
pub fn new() -> Result<Self, libloading::Error> {
|
||||||
|
let lib = unsafe { libloading::Library::new(Self::CUDA_PATH) }?;
|
||||||
|
Ok(Self { lib })
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
macro_rules! dynamic_fns {
|
||||||
|
($($abi:literal fn $fn_name:ident( $($arg_id:ident : $arg_type:ty),* ) -> $ret_type:ty;)*) => {
|
||||||
|
impl DynamicCuda {
|
||||||
|
$(
|
||||||
|
#[allow(dead_code)]
|
||||||
|
unsafe fn $fn_name(&self, $($arg_id : $arg_type),*) -> Result<$ret_type, libloading::Error> {
|
||||||
|
let func = unsafe { self.lib.get::<unsafe extern "system" fn ($($arg_type),*) -> $ret_type>(concat!(stringify!($fn_name), "\0").as_bytes()) };
|
||||||
|
func.map(|f| f($($arg_id),*) )
|
||||||
|
}
|
||||||
|
)*
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
cuda_base::cuda_function_declarations!(dynamic_fns);
|
||||||
|
|
||||||
static COMGR: std::sync::LazyLock<Comgr> = std::sync::LazyLock::new(|| Comgr::new().unwrap());
|
static COMGR: std::sync::LazyLock<Comgr> = std::sync::LazyLock::new(|| Comgr::new().unwrap());
|
||||||
|
static CUDA: std::sync::LazyLock<DynamicCuda> =
|
||||||
|
std::sync::LazyLock::new(|| DynamicCuda::new().unwrap());
|
||||||
|
|
||||||
fn run_hip<Input: From<u8> + Copy + Debug, Output: From<u8> + Copy + Debug + Default>(
|
fn run_hip<Input: From<u8> + Copy + Debug, Output: From<u8> + Copy + Debug + Default>(
|
||||||
name: &CStr,
|
name: &CStr,
|
||||||
|
@ -251,7 +251,10 @@ ptx_parser_macros::generate_instruction_type!(
|
|||||||
},
|
},
|
||||||
src1: T,
|
src1: T,
|
||||||
src2: T,
|
src2: T,
|
||||||
src3: T,
|
src3: {
|
||||||
|
repr: T,
|
||||||
|
type: { Type::from(data.dst_type()) },
|
||||||
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
Max {
|
Max {
|
||||||
|
@ -42,3 +42,6 @@ pub(crate) fn set_d32_v2(dst: hipDeviceptr_t, ui: ::core::ffi::c_uint, n: usize)
|
|||||||
pub(crate) fn set_d8_v2(dst: hipDeviceptr_t, value: ::core::ffi::c_uchar, n: usize) -> hipError_t {
|
pub(crate) fn set_d8_v2(dst: hipDeviceptr_t, value: ::core::ffi::c_uchar, n: usize) -> hipError_t {
|
||||||
unsafe { hipMemsetD8(dst, value, n) }
|
unsafe { hipMemsetD8(dst, value, n) }
|
||||||
}
|
}
|
||||||
|
pub(crate) fn get_info_v2(free: *mut usize, total: *mut usize) -> hipError_t {
|
||||||
|
unsafe { hipMemGetInfo(free, total) }
|
||||||
|
}
|
||||||
|
@ -9,6 +9,7 @@ pub(super) mod function;
|
|||||||
pub(super) mod memory;
|
pub(super) mod memory;
|
||||||
pub(super) mod module;
|
pub(super) mod module;
|
||||||
pub(super) mod pointer;
|
pub(super) mod pointer;
|
||||||
|
pub(super) mod stream;
|
||||||
|
|
||||||
#[cfg(debug_assertions)]
|
#[cfg(debug_assertions)]
|
||||||
pub(crate) fn unimplemented() -> CUresult {
|
pub(crate) fn unimplemented() -> CUresult {
|
||||||
|
5
zluda/src/impl/stream.rs
Normal file
5
zluda/src/impl/stream.rs
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
use hip_runtime_sys::*;
|
||||||
|
|
||||||
|
pub(crate) fn synchronize(stream: hipStream_t) -> hipError_t {
|
||||||
|
unsafe { hipStreamSynchronize(stream) }
|
||||||
|
}
|
@ -54,26 +54,28 @@ cuda_base::cuda_function_declarations!(
|
|||||||
cuDeviceGetCount,
|
cuDeviceGetCount,
|
||||||
cuDeviceGetLuid,
|
cuDeviceGetLuid,
|
||||||
cuDeviceGetName,
|
cuDeviceGetName,
|
||||||
cuDevicePrimaryCtxRelease,
|
|
||||||
cuDevicePrimaryCtxRetain,
|
|
||||||
cuDeviceGetProperties,
|
cuDeviceGetProperties,
|
||||||
cuDeviceGetUuid,
|
cuDeviceGetUuid,
|
||||||
cuDeviceGetUuid_v2,
|
cuDeviceGetUuid_v2,
|
||||||
|
cuDevicePrimaryCtxRelease,
|
||||||
|
cuDevicePrimaryCtxRetain,
|
||||||
cuDeviceTotalMem_v2,
|
cuDeviceTotalMem_v2,
|
||||||
cuDriverGetVersion,
|
cuDriverGetVersion,
|
||||||
cuFuncGetAttribute,
|
cuFuncGetAttribute,
|
||||||
cuInit,
|
cuInit,
|
||||||
cuMemAlloc_v2,
|
cuMemAlloc_v2,
|
||||||
cuMemFree_v2,
|
cuMemFree_v2,
|
||||||
|
cuMemGetAddressRange_v2,
|
||||||
|
cuMemGetInfo_v2,
|
||||||
cuMemcpyDtoH_v2,
|
cuMemcpyDtoH_v2,
|
||||||
cuMemcpyHtoD_v2,
|
cuMemcpyHtoD_v2,
|
||||||
|
cuMemsetD32_v2,
|
||||||
|
cuMemsetD8_v2,
|
||||||
cuModuleGetFunction,
|
cuModuleGetFunction,
|
||||||
cuModuleLoadData,
|
cuModuleLoadData,
|
||||||
cuModuleUnload,
|
cuModuleUnload,
|
||||||
cuPointerGetAttribute,
|
cuPointerGetAttribute,
|
||||||
cuMemGetAddressRange_v2,
|
cuStreamSynchronize
|
||||||
cuMemsetD32_v2,
|
|
||||||
cuMemsetD8_v2
|
|
||||||
],
|
],
|
||||||
implemented_in_function <= [
|
implemented_in_function <= [
|
||||||
cuLaunchKernel,
|
cuLaunchKernel,
|
||||||
|
Reference in New Issue
Block a user