diff --git a/ptx/src/test/ll/fmax.ll b/ptx/src/test/ll/fmax.ll index d50ae8d..ba640cf 100644 --- a/ptx/src/test/ll/fmax.ll +++ b/ptx/src/test/ll/fmax.ll @@ -1,51 +1,45 @@ -define amdgpu_kernel void @add_s32_sat(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 { - %"39" = alloca i64, align 8, addrspace(5) - %"40" = alloca i64, align 8, addrspace(5) - %"41" = alloca i32, align 4, addrspace(5) - %"42" = alloca i32, align 4, addrspace(5) - %"43" = alloca i32, align 4, addrspace(5) - %"44" = alloca i32, align 4, addrspace(5) +define amdgpu_kernel void @fmax(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 { + %"37" = alloca i64, align 8, addrspace(5) + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca half, align 2, addrspace(5) + %"40" = alloca half, align 2, addrspace(5) + %"41" = alloca half, align 2, addrspace(5) + %"42" = alloca half, align 2, addrspace(5) br label %1 1: ; preds = %0 - br label %"36" + br label %"34" -"36": ; preds = %1 - %"45" = load i64, ptr addrspace(4) %"37", align 4 - store i64 %"45", ptr addrspace(5) %"39", align 4 - %"46" = load i64, ptr addrspace(4) %"38", align 4 - store i64 %"46", ptr addrspace(5) %"40", align 4 - %"48" = load i64, ptr addrspace(5) %"39", align 4 - %"61" = inttoptr i64 %"48" to ptr - %"47" = load i32, ptr %"61", align 4 - store i32 %"47", ptr addrspace(5) %"41", align 4 - %"49" = load i64, ptr addrspace(5) %"39", align 4 - %"62" = inttoptr i64 %"49" to ptr - %"33" = getelementptr inbounds i8, ptr %"62", i64 4 - %"50" = load i32, ptr %"33", align 4 - store i32 %"50", ptr addrspace(5) %"42", align 4 - %"52" = load i32, ptr addrspace(5) %"41", align 4 - %"53" = load i32, ptr addrspace(5) %"42", align 4 - %"51" = call i32 @llvm.sadd.sat.i32(i32 %"52", i32 %"53") - store i32 %"51", ptr addrspace(5) %"43", align 4 - %"55" = load i32, ptr addrspace(5) %"41", align 4 - %"56" = load i32, ptr addrspace(5) %"42", align 4 - %"54" = add i32 %"55", %"56" - store i32 %"54", ptr addrspace(5) %"44", align 4 - %"57" = load i64, ptr addrspace(5) %"40", align 4 - %"58" = load i32, ptr addrspace(5) %"43", align 4 - %"63" = inttoptr i64 %"57" to ptr - store i32 %"58", ptr %"63", align 4 - %"59" = load i64, ptr addrspace(5) %"40", align 4 - %"64" = inttoptr i64 %"59" to ptr - %"35" = getelementptr inbounds i8, ptr %"64", i64 4 - %"60" = load i32, ptr addrspace(5) %"44", align 4 - store i32 %"60", ptr %"35", align 4 +"34": ; preds = %1 + %"43" = load i64, ptr addrspace(4) %"35", align 4 + store i64 %"43", ptr addrspace(5) %"37", align 4 + %"44" = load i64, ptr addrspace(4) %"36", align 4 + store i64 %"44", ptr addrspace(5) %"38", align 4 + %"46" = load i64, ptr addrspace(5) %"37", align 4 + %"55" = inttoptr i64 %"46" to ptr + %"54" = load i16, ptr %"55", align 2 + %"45" = bitcast i16 %"54" to half + store half %"45", ptr addrspace(5) %"39", align 2 + %"47" = load i64, ptr addrspace(5) %"37", align 4 + %"56" = inttoptr i64 %"47" to ptr + %"33" = getelementptr inbounds i8, ptr %"56", i64 2 + %"57" = load i16, ptr %"33", align 2 + %"48" = bitcast i16 %"57" to half + store half %"48", ptr addrspace(5) %"40", align 2 + %"50" = load half, ptr addrspace(5) %"40", align 2 + %"51" = load half, ptr addrspace(5) %"39", align 2 + %"49" = call half @llvm.maxnum.f16(half %"50", half %"51") + store half %"49", ptr addrspace(5) %"41", align 2 + %"52" = load i64, ptr addrspace(5) %"38", align 4 + %"53" = load half, ptr addrspace(5) %"41", align 2 + %"58" = inttoptr i64 %"52" to ptr + %"59" = bitcast half %"53" to i16 + store i16 %"59", ptr %"58", align 2 ret void } ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) -declare i32 @llvm.sadd.sat.i32(i32, i32) #1 +declare half @llvm.maxnum.f16(half, half) #1 -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file diff --git a/ptx/src/test/spirv_run/fmax.ptx b/ptx/src/test/spirv_run/fmax.ptx index 11ddb42..6f7a528 100644 --- a/ptx/src/test/spirv_run/fmax.ptx +++ b/ptx/src/test/spirv_run/fmax.ptx @@ -11,13 +11,15 @@ .reg .u64 out_addr; .reg .f16 temp1; .reg .f16 temp2; + .reg .f16 result1; + .reg .f16 result2; ld.param.u64 in_addr, [input]; ld.param.u64 out_addr, [output]; ld.b16 temp1, [in_addr]; ld.b16 temp2, [in_addr+2]; - max.f16 temp1, temp1, temp2; - st.b16 [out_addr], temp1; + max.f16 result1, temp2, temp1; + st.b16 [out_addr], result1; ret; } diff --git a/ptx/src/test/spirv_run/mod.rs b/ptx/src/test/spirv_run/mod.rs index 00bbdf4..1ac10ea 100644 --- a/ptx/src/test/spirv_run/mod.rs +++ b/ptx/src/test/spirv_run/mod.rs @@ -18,7 +18,7 @@ use std::str; macro_rules! read_test_file { ($file:expr) => { { - // CARGO_MANIFEST_DIR is the crate directory (ptx), but file! is relative to the workspace root (and therefore also includes ptx). + // CARGO_MANIFEST_DIR is the crate directory (ptx), but file! is relative to the workspace root (and therefore also includes ptx). let mut path = PathBuf::from(env!("CARGO_MANIFEST_DIR")); path.pop(); path.push(file!()); @@ -175,7 +175,7 @@ test_ptx!(sin, [std::f32::consts::PI / 2f32], [1f32]); test_ptx!(cos, [std::f32::consts::PI], [-1f32]); test_ptx!(lg2, [512f32], [9f32]); test_ptx!(ex2, [10f32], [1024f32]); -test_ptx!(fmax, [0u32, 0x8000u32], [0x8000u32]); +test_ptx!(fmax, [0u16, half::f16::NAN.to_bits()], [0u16]); test_ptx!(cvt_rni, [9.5f32, 10.5f32], [10f32, 10f32]); test_ptx!(cvt_rzi, [-13.8f32, 12.9f32], [-13f32, 12f32]); test_ptx!(cvt_s32_f32, [-13.8f32, 12.9f32], [-13i32, 13i32]);