Implement non-coherent loads and implicit sign-extending conversions

This commit is contained in:
Andrzej Janik
2021-03-03 21:22:31 +01:00
parent cdac38d572
commit efd91e270c
8 changed files with 178 additions and 5 deletions

View File

@ -740,6 +740,7 @@ pub struct LdDetails {
pub state_space: LdStateSpace,
pub caching: LdCacheOperator,
pub typ: LdStType,
pub non_coherent: bool,
}
sub_type! {

View File

@ -93,6 +93,7 @@ match {
".min",
".nan",
".NaN",
".nc",
".ne",
".neu",
".num",
@ -750,13 +751,38 @@ Instruction: ast::Instruction<ast::ParsedArgParams<'input>> = {
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld
InstLd: ast::Instruction<ast::ParsedArgParams<'input>> = {
"ld" <q:LdStQualifier?> <ss:LdStateSpace?> <cop:LdCacheOperator?> <t:LdStType> <dst:DstOperandVec> "," <src:MemoryOperand> => {
"ld" <q:LdStQualifier?> <ss:LdNonGlobalStateSpace?> <cop:LdCacheOperator?> <t:LdStType> <dst:DstOperandVec> "," <src:MemoryOperand> => {
ast::Instruction::Ld(
ast::LdDetails {
qualifier: q.unwrap_or(ast::LdStQualifier::Weak),
state_space: ss.unwrap_or(ast::LdStateSpace::Generic),
caching: cop.unwrap_or(ast::LdCacheOperator::Cached),
typ: t
typ: t,
non_coherent: false
},
ast::Arg2Ld { dst:dst, src:src }
)
},
"ld" <q:LdStQualifier?> ".global" <cop:LdCacheOperator?> <t:LdStType> <dst:DstOperandVec> "," <src:MemoryOperand> => {
ast::Instruction::Ld(
ast::LdDetails {
qualifier: q.unwrap_or(ast::LdStQualifier::Weak),
state_space: ast::LdStateSpace::Global,
caching: cop.unwrap_or(ast::LdCacheOperator::Cached),
typ: t,
non_coherent: false
},
ast::Arg2Ld { dst:dst, src:src }
)
},
"ld" ".global" <cop:LdNcCacheOperator?> ".nc" <t:LdStType> <dst:DstOperandVec> "," <src:MemoryOperand> => {
ast::Instruction::Ld(
ast::LdDetails {
qualifier: ast::LdStQualifier::Weak,
state_space: ast::LdStateSpace::Global,
caching: cop.unwrap_or(ast::LdCacheOperator::Cached),
typ: t,
non_coherent: true
},
ast::Arg2Ld { dst:dst, src:src }
)
@ -781,9 +807,8 @@ MemScope: ast::MemScope = {
".sys" => ast::MemScope::Sys
};
LdStateSpace: ast::LdStateSpace = {
LdNonGlobalStateSpace: ast::LdStateSpace = {
".const" => ast::LdStateSpace::Const,
".global" => ast::LdStateSpace::Global,
".local" => ast::LdStateSpace::Local,
".param" => ast::LdStateSpace::Param,
".shared" => ast::LdStateSpace::Shared,
@ -797,6 +822,12 @@ LdCacheOperator: ast::LdCacheOperator = {
".cv" => ast::LdCacheOperator::Uncached,
};
// Cache operators accepted by the non-coherent `ld.global ... .nc` alternative
// of `InstLd`, where this rule supplies the optional `cop`. Only `.ca`, `.cg`
// and `.cs` are listed; `.cv`, which `LdCacheOperator` accepts, is not.
// The result type is still `ast::LdCacheOperator`.
LdNcCacheOperator: ast::LdCacheOperator = {
".ca" => ast::LdCacheOperator::Cached,
".cg" => ast::LdCacheOperator::L2Only,
".cs" => ast::LdCacheOperator::Streaming,
};
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mov
InstMov: ast::Instruction<ast::ParsedArgParams<'input>> = {
"mov" <pref:VectorPrefix?> <t:MovScalarType> <dst:DstOperandVec> "," <src:SrcOperandVec> => {

View File

@ -0,0 +1,22 @@
.version 6.5
.target sm_30
.address_size 64
// Test kernel for the non-coherent global load (`ld.global.nc`): reads one
// u64 from the address passed in `input`, adds 1 to it, and writes the sum to
// the address passed in `output`.
.visible .entry add_non_coherent(
.param .u64 input,
.param .u64 output
)
{
.reg .u64 in_addr;
.reg .u64 out_addr;
.reg .u64 temp;
.reg .u64 temp2;
// Copy the two address arguments out of parameter space into registers.
ld.param.u64 in_addr, [input];
ld.param.u64 out_addr, [output];
// Instruction under test: a global load carrying the `.nc` qualifier.
ld.global.nc.u64 temp, [in_addr];
add.u64 temp2, temp, 1;
// The result goes back through an ordinary global store.
st.global.u64 [out_addr], temp2;
ret;
}

View File

@ -0,0 +1,47 @@
OpCapability GenericPointer
OpCapability Linkage
OpCapability Addresses
OpCapability Kernel
OpCapability Int8
OpCapability Int16
OpCapability Int64
OpCapability Float16
OpCapability Float64
%23 = OpExtInstImport "OpenCL.std"
OpMemoryModel Physical64 OpenCL
OpEntryPoint Kernel %1 "add_non_coherent"
%void = OpTypeVoid
%ulong = OpTypeInt 64 0
%26 = OpTypeFunction %void %ulong %ulong
%_ptr_Function_ulong = OpTypePointer Function %ulong
%_ptr_CrossWorkgroup_ulong = OpTypePointer CrossWorkgroup %ulong
%ulong_1 = OpConstant %ulong 1
%1 = OpFunction %void None %26
%8 = OpFunctionParameter %ulong
%9 = OpFunctionParameter %ulong
%21 = OpLabel
%2 = OpVariable %_ptr_Function_ulong Function
%3 = OpVariable %_ptr_Function_ulong Function
%4 = OpVariable %_ptr_Function_ulong Function
%5 = OpVariable %_ptr_Function_ulong Function
%6 = OpVariable %_ptr_Function_ulong Function
%7 = OpVariable %_ptr_Function_ulong Function
OpStore %2 %8
OpStore %3 %9
%10 = OpLoad %ulong %2 Aligned 8
OpStore %4 %10
%11 = OpLoad %ulong %3 Aligned 8
OpStore %5 %11
%13 = OpLoad %ulong %4
%19 = OpConvertUToPtr %_ptr_CrossWorkgroup_ulong %13
%12 = OpLoad %ulong %19 Aligned 8
OpStore %6 %12
%15 = OpLoad %ulong %6
%14 = OpIAdd %ulong %15 %ulong_1
OpStore %7 %14
%16 = OpLoad %ulong %5
%17 = OpLoad %ulong %7
%20 = OpConvertUToPtr %_ptr_CrossWorkgroup_ulong %16
OpStore %20 %17 Aligned 8
OpReturn
OpFunctionEnd

View File

@ -153,6 +153,8 @@ test_ptx!(shared_ptr_take_address, [97815231u64], [97815231u64]);
test_ptx!(assertfail, [716523871u64], [716523872u64]);
test_ptx!(cvt_s64_s32, [-1i32], [-1i64]);
test_ptx!(add_tuning, [2u64], [3u64]);
// NOTE(review): the comments below assume the macro arguments are
// (kernel name, input data, expected output) — confirm against `test_ptx!`.
// `add_non_coherent` loads a u64 with `ld.global.nc` and adds 1 (3 -> 4).
test_ptx!(add_non_coherent, [3u64], [4u64]);
// `sign_extend` loads an s16 into an s32 register; -1 must stay -1 once widened.
test_ptx!(sign_extend, [-1i16], [-1i32]);
struct DisplayError<T: Debug> {
err: T,

View File

@ -0,0 +1,20 @@
.version 6.5
.target sm_30
.address_size 64
// Test kernel for implicit sign extension: a signed 16-bit value is loaded
// into a signed 32-bit register and then stored back out as 32 bits.
.visible .entry sign_extend(
.param .u64 input,
.param .u64 output
)
{
.reg .u64 in_addr;
.reg .u64 out_addr;
.reg .s32 temp;
// Copy the two address arguments out of parameter space into registers.
ld.param.u64 in_addr, [input];
ld.param.u64 out_addr, [output];
// Instruction under test: the loaded type (.s16) is narrower than the
// destination register (.s32), so the value has to be widened implicitly.
// No state space is written on this load or on the store below.
ld.s16 temp, [in_addr];
st.s32 [out_addr], temp;
ret;
}

View File

@ -0,0 +1,47 @@
OpCapability GenericPointer
OpCapability Linkage
OpCapability Addresses
OpCapability Kernel
OpCapability Int8
OpCapability Int16
OpCapability Int64
OpCapability Float16
OpCapability Float64
%20 = OpExtInstImport "OpenCL.std"
OpMemoryModel Physical64 OpenCL
OpEntryPoint Kernel %1 "sign_extend"
%void = OpTypeVoid
%ulong = OpTypeInt 64 0
%23 = OpTypeFunction %void %ulong %ulong
%_ptr_Function_ulong = OpTypePointer Function %ulong
%uint = OpTypeInt 32 0
%_ptr_Function_uint = OpTypePointer Function %uint
%ushort = OpTypeInt 16 0
%_ptr_Generic_ushort = OpTypePointer Generic %ushort
%_ptr_Generic_uint = OpTypePointer Generic %uint
%1 = OpFunction %void None %23
%7 = OpFunctionParameter %ulong
%8 = OpFunctionParameter %ulong
%18 = OpLabel
%2 = OpVariable %_ptr_Function_ulong Function
%3 = OpVariable %_ptr_Function_ulong Function
%4 = OpVariable %_ptr_Function_ulong Function
%5 = OpVariable %_ptr_Function_ulong Function
%6 = OpVariable %_ptr_Function_uint Function
OpStore %2 %7
OpStore %3 %8
%9 = OpLoad %ulong %2 Aligned 8
OpStore %4 %9
%10 = OpLoad %ulong %3 Aligned 8
OpStore %5 %10
%12 = OpLoad %ulong %4
%16 = OpConvertUToPtr %_ptr_Generic_ushort %12
%15 = OpLoad %ushort %16 Aligned 2
%11 = OpSConvert %uint %15
OpStore %6 %11
%13 = OpLoad %ulong %5
%14 = OpLoad %uint %6
%17 = OpConvertUToPtr %_ptr_Generic_uint %13
OpStore %17 %14 Aligned 4
OpReturn
OpFunctionEnd

View File

@ -4285,7 +4285,10 @@ fn emit_implicit_conversion(
}
}
}
(TypeKind::Scalar, TypeKind::Scalar, ConversionKind::SignExtend) => todo!(),
(TypeKind::Scalar, TypeKind::Scalar, ConversionKind::SignExtend) => {
let result_type = map.get_or_add(builder, SpirvType::from(cv.to.clone()));
builder.s_convert(result_type , Some(cv.dst), cv.src)?;
},
(TypeKind::Vector, TypeKind::Scalar, ConversionKind::Default)
| (TypeKind::Scalar, TypeKind::Array, ConversionKind::Default)
| (TypeKind::Array, TypeKind::Scalar, ConversionKind::Default) => {