Merge f146176b5f393b3eb528e8afdddcda86c0c5386f into ce9c0aac236ec712a33b9a53844a011c6303f3b8

This commit is contained in:
Violet
2025-08-02 06:43:55 +00:00
committed by GitHub
18 changed files with 580 additions and 2 deletions

View File

@ -173,6 +173,7 @@ fn run_instruction<'input>(
| ast::Instruction::Setp { .. }
| ast::Instruction::SetpBool { .. }
| ast::Instruction::ShflSync { .. }
| ast::Instruction::Shf { .. }
| ast::Instruction::Shl { .. }
| ast::Instruction::Shr { .. }
| ast::Instruction::Sin { .. }

View File

@ -1823,6 +1823,7 @@ fn get_modes<T: ast::Operand>(inst: &ast::Instruction<T>) -> InstructionModes {
| ast::Instruction::CpAsyncCommitGroup { .. }
| ast::Instruction::CpAsyncWaitGroup { .. }
| ast::Instruction::CpAsyncWaitAll { .. }
| ast::Instruction::Shf { .. }
| ast::Instruction::Shl { .. }
| ast::Instruction::Selp { .. }
| ast::Instruction::Ret { .. }

View File

@ -34,7 +34,7 @@ use crate::pass::*;
use llvm_zluda::{core::*, *};
use llvm_zluda::{prelude::*, LLVMZludaBuildAtomicRMW};
use llvm_zluda::{LLVMCallConv, LLVMZludaBuildAlloca};
use ptx_parser::{CpAsyncArgs, CpAsyncDetails, Mul24Control};
use ptx_parser::{CpAsyncArgs, CpAsyncDetails, FunnelShiftMode, Mul24Control, ShfArgs};
struct Builder(LLVMBuilderRef);
@ -485,6 +485,7 @@ impl<'a> MethodEmitContext<'a> {
ast::Instruction::Bra { arguments } => self.emit_bra(arguments),
ast::Instruction::Call { data, arguments } => self.emit_call(data, arguments),
ast::Instruction::Cvt { data, arguments } => self.emit_cvt(data, arguments),
ast::Instruction::Shf { data, arguments } => self.emit_shf(data, arguments),
ast::Instruction::Shr { data, arguments } => self.emit_shr(data, arguments),
ast::Instruction::Shl { data, arguments } => self.emit_shl(data, arguments),
ast::Instruction::Ret { data } => Ok(self.emit_ret(data)),
@ -1990,6 +1991,61 @@ impl<'a> MethodEmitContext<'a> {
Ok(())
}
fn emit_shf(
&mut self,
data: ptx_parser::ShfDetails,
arguments: ShfArgs<SpirvWord>,
) -> Result<(), TranslateError> {
let lsb = self.resolver.value(arguments.src_a)?;
let msb = self.resolver.value(arguments.src_b)?;
let shift_amount = self.resolver.value(arguments.src_c)?;
let llvm_i32 = get_scalar_type(self.context, ast::ScalarType::B32);
let const_32 = unsafe { LLVMConstInt(llvm_i32, 32, 0) };
let intrinsic = match data.direction {
ptx_parser::ShiftDirection::R => c"llvm.fshr.i32",
ptx_parser::ShiftDirection::L => c"llvm.fshl.i32",
};
let shifted = self.emit_intrinsic(
intrinsic,
None,
Some(&ast::Type::Scalar(ptx_parser::ScalarType::B32)),
vec![(msb, llvm_i32), (lsb, llvm_i32), (shift_amount, llvm_i32)],
)?;
if data.mode == FunnelShiftMode::Clamp {
// `llvm.fsh*` acts like `shf.*.wrap`. To implement clamp, we must conditionally return
// the left or right-most 32 bits if `shift_amount` is greater than or equal to 32.
let should_clamp = unsafe {
LLVMBuildICmp(
self.builder,
LLVMIntPredicate::LLVMIntUGE,
shift_amount,
const_32,
LLVM_UNNAMED.as_ptr(),
)
};
let max_shift = match data.direction {
ptx_parser::ShiftDirection::R => msb,
ptx_parser::ShiftDirection::L => lsb,
};
self.resolver.with_result(arguments.dst, |dst| unsafe {
LLVMBuildSelect(self.builder, should_clamp, max_shift, shifted, dst)
});
} else {
let name = self.resolver.get_or_add(arguments.dst);
unsafe { LLVMSetValueName2(shifted, name.as_ptr().cast(), name.len()) };
self.resolver.register(arguments.dst, shifted);
}
Ok(())
}
fn emit_shr(
&mut self,
data: ptx_parser::ShrData,

50
ptx/src/test/ll/shf_l.ll Normal file
View File

@ -0,0 +1,50 @@
define amdgpu_kernel void @shf_l(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 {
%"39" = alloca i64, align 8, addrspace(5)
%"40" = alloca i64, align 8, addrspace(5)
%"41" = alloca i32, align 4, addrspace(5)
%"42" = alloca i32, align 4, addrspace(5)
%"43" = alloca i32, align 4, addrspace(5)
%"44" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"36"
"36": ; preds = %1
%"45" = load i64, ptr addrspace(4) %"37", align 8
store i64 %"45", ptr addrspace(5) %"39", align 8
%"46" = load i64, ptr addrspace(4) %"38", align 8
store i64 %"46", ptr addrspace(5) %"40", align 8
%"48" = load i64, ptr addrspace(5) %"39", align 8
%"59" = inttoptr i64 %"48" to ptr
%"47" = load i32, ptr %"59", align 4
store i32 %"47", ptr addrspace(5) %"41", align 4
%"49" = load i64, ptr addrspace(5) %"39", align 8
%"60" = inttoptr i64 %"49" to ptr
%"33" = getelementptr inbounds i8, ptr %"60", i64 4
%"50" = load i32, ptr %"33", align 4
store i32 %"50", ptr addrspace(5) %"42", align 4
%"51" = load i64, ptr addrspace(5) %"39", align 8
%"61" = inttoptr i64 %"51" to ptr
%"35" = getelementptr inbounds i8, ptr %"61", i64 8
%"52" = load i32, ptr %"35", align 4
store i32 %"52", ptr addrspace(5) %"43", align 4
%"54" = load i32, ptr addrspace(5) %"41", align 4
%"55" = load i32, ptr addrspace(5) %"42", align 4
%"56" = load i32, ptr addrspace(5) %"43", align 4
%2 = call i32 @llvm.fshl.i32(i32 %"55", i32 %"54", i32 %"56")
%3 = icmp uge i32 %"56", 32
%"62" = select i1 %3, i32 %"54", i32 %2
store i32 %"62", ptr addrspace(5) %"44", align 4
%"57" = load i64, ptr addrspace(5) %"40", align 8
%"58" = load i32, ptr addrspace(5) %"44", align 4
%"63" = inttoptr i64 %"57" to ptr
store i32 %"58", ptr %"63", align 4
ret void
}
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.fshl.i32(i32, i32, i32) #1
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }

View File

@ -0,0 +1,50 @@
define amdgpu_kernel void @shf_l_clamp(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 {
%"39" = alloca i64, align 8, addrspace(5)
%"40" = alloca i64, align 8, addrspace(5)
%"41" = alloca i32, align 4, addrspace(5)
%"42" = alloca i32, align 4, addrspace(5)
%"43" = alloca i32, align 4, addrspace(5)
%"44" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"36"
"36": ; preds = %1
%"45" = load i64, ptr addrspace(4) %"37", align 8
store i64 %"45", ptr addrspace(5) %"39", align 8
%"46" = load i64, ptr addrspace(4) %"38", align 8
store i64 %"46", ptr addrspace(5) %"40", align 8
%"48" = load i64, ptr addrspace(5) %"39", align 8
%"59" = inttoptr i64 %"48" to ptr
%"47" = load i32, ptr %"59", align 4
store i32 %"47", ptr addrspace(5) %"41", align 4
%"49" = load i64, ptr addrspace(5) %"39", align 8
%"60" = inttoptr i64 %"49" to ptr
%"33" = getelementptr inbounds i8, ptr %"60", i64 4
%"50" = load i32, ptr %"33", align 4
store i32 %"50", ptr addrspace(5) %"42", align 4
%"51" = load i64, ptr addrspace(5) %"39", align 8
%"61" = inttoptr i64 %"51" to ptr
%"35" = getelementptr inbounds i8, ptr %"61", i64 8
%"52" = load i32, ptr %"35", align 4
store i32 %"52", ptr addrspace(5) %"43", align 4
%"54" = load i32, ptr addrspace(5) %"41", align 4
%"55" = load i32, ptr addrspace(5) %"42", align 4
%"56" = load i32, ptr addrspace(5) %"43", align 4
%2 = call i32 @llvm.fshl.i32(i32 %"55", i32 %"54", i32 %"56")
%3 = icmp uge i32 %"56", 32
%"62" = select i1 %3, i32 %"54", i32 %2
store i32 %"62", ptr addrspace(5) %"44", align 4
%"57" = load i64, ptr addrspace(5) %"40", align 8
%"58" = load i32, ptr addrspace(5) %"44", align 4
%"63" = inttoptr i64 %"57" to ptr
store i32 %"58", ptr %"63", align 4
ret void
}
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.fshl.i32(i32, i32, i32) #1
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }

View File

@ -0,0 +1,48 @@
define amdgpu_kernel void @shf_l_wrap(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 {
%"39" = alloca i64, align 8, addrspace(5)
%"40" = alloca i64, align 8, addrspace(5)
%"41" = alloca i32, align 4, addrspace(5)
%"42" = alloca i32, align 4, addrspace(5)
%"43" = alloca i32, align 4, addrspace(5)
%"44" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"36"
"36": ; preds = %1
%"45" = load i64, ptr addrspace(4) %"37", align 8
store i64 %"45", ptr addrspace(5) %"39", align 8
%"46" = load i64, ptr addrspace(4) %"38", align 8
store i64 %"46", ptr addrspace(5) %"40", align 8
%"48" = load i64, ptr addrspace(5) %"39", align 8
%"59" = inttoptr i64 %"48" to ptr
%"47" = load i32, ptr %"59", align 4
store i32 %"47", ptr addrspace(5) %"41", align 4
%"49" = load i64, ptr addrspace(5) %"39", align 8
%"60" = inttoptr i64 %"49" to ptr
%"33" = getelementptr inbounds i8, ptr %"60", i64 4
%"50" = load i32, ptr %"33", align 4
store i32 %"50", ptr addrspace(5) %"42", align 4
%"51" = load i64, ptr addrspace(5) %"39", align 8
%"61" = inttoptr i64 %"51" to ptr
%"35" = getelementptr inbounds i8, ptr %"61", i64 8
%"52" = load i32, ptr %"35", align 4
store i32 %"52", ptr addrspace(5) %"43", align 4
%"54" = load i32, ptr addrspace(5) %"41", align 4
%"55" = load i32, ptr addrspace(5) %"42", align 4
%"56" = load i32, ptr addrspace(5) %"43", align 4
%"62" = call i32 @llvm.fshl.i32(i32 %"55", i32 %"54", i32 %"56")
store i32 %"62", ptr addrspace(5) %"44", align 4
%"57" = load i64, ptr addrspace(5) %"40", align 8
%"58" = load i32, ptr addrspace(5) %"44", align 4
%"63" = inttoptr i64 %"57" to ptr
store i32 %"58", ptr %"63", align 4
ret void
}
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.fshl.i32(i32, i32, i32) #1
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }

50
ptx/src/test/ll/shf_r.ll Normal file
View File

@ -0,0 +1,50 @@
define amdgpu_kernel void @shf_r(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 {
%"39" = alloca i64, align 8, addrspace(5)
%"40" = alloca i64, align 8, addrspace(5)
%"41" = alloca i32, align 4, addrspace(5)
%"42" = alloca i32, align 4, addrspace(5)
%"43" = alloca i32, align 4, addrspace(5)
%"44" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"36"
"36": ; preds = %1
%"45" = load i64, ptr addrspace(4) %"37", align 8
store i64 %"45", ptr addrspace(5) %"39", align 8
%"46" = load i64, ptr addrspace(4) %"38", align 8
store i64 %"46", ptr addrspace(5) %"40", align 8
%"48" = load i64, ptr addrspace(5) %"39", align 8
%"59" = inttoptr i64 %"48" to ptr
%"47" = load i32, ptr %"59", align 4
store i32 %"47", ptr addrspace(5) %"41", align 4
%"49" = load i64, ptr addrspace(5) %"39", align 8
%"60" = inttoptr i64 %"49" to ptr
%"33" = getelementptr inbounds i8, ptr %"60", i64 4
%"50" = load i32, ptr %"33", align 4
store i32 %"50", ptr addrspace(5) %"42", align 4
%"51" = load i64, ptr addrspace(5) %"39", align 8
%"61" = inttoptr i64 %"51" to ptr
%"35" = getelementptr inbounds i8, ptr %"61", i64 8
%"52" = load i32, ptr %"35", align 4
store i32 %"52", ptr addrspace(5) %"43", align 4
%"54" = load i32, ptr addrspace(5) %"41", align 4
%"55" = load i32, ptr addrspace(5) %"42", align 4
%"56" = load i32, ptr addrspace(5) %"43", align 4
%2 = call i32 @llvm.fshr.i32(i32 %"55", i32 %"54", i32 %"56")
%3 = icmp uge i32 %"56", 32
%"62" = select i1 %3, i32 %"55", i32 %2
store i32 %"62", ptr addrspace(5) %"44", align 4
%"57" = load i64, ptr addrspace(5) %"40", align 8
%"58" = load i32, ptr addrspace(5) %"44", align 4
%"63" = inttoptr i64 %"57" to ptr
store i32 %"58", ptr %"63", align 4
ret void
}
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.fshr.i32(i32, i32, i32) #1
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }

View File

@ -0,0 +1,50 @@
define amdgpu_kernel void @shf_r_clamp(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 {
%"39" = alloca i64, align 8, addrspace(5)
%"40" = alloca i64, align 8, addrspace(5)
%"41" = alloca i32, align 4, addrspace(5)
%"42" = alloca i32, align 4, addrspace(5)
%"43" = alloca i32, align 4, addrspace(5)
%"44" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"36"
"36": ; preds = %1
%"45" = load i64, ptr addrspace(4) %"37", align 8
store i64 %"45", ptr addrspace(5) %"39", align 8
%"46" = load i64, ptr addrspace(4) %"38", align 8
store i64 %"46", ptr addrspace(5) %"40", align 8
%"48" = load i64, ptr addrspace(5) %"39", align 8
%"59" = inttoptr i64 %"48" to ptr
%"47" = load i32, ptr %"59", align 4
store i32 %"47", ptr addrspace(5) %"41", align 4
%"49" = load i64, ptr addrspace(5) %"39", align 8
%"60" = inttoptr i64 %"49" to ptr
%"33" = getelementptr inbounds i8, ptr %"60", i64 4
%"50" = load i32, ptr %"33", align 4
store i32 %"50", ptr addrspace(5) %"42", align 4
%"51" = load i64, ptr addrspace(5) %"39", align 8
%"61" = inttoptr i64 %"51" to ptr
%"35" = getelementptr inbounds i8, ptr %"61", i64 8
%"52" = load i32, ptr %"35", align 4
store i32 %"52", ptr addrspace(5) %"43", align 4
%"54" = load i32, ptr addrspace(5) %"41", align 4
%"55" = load i32, ptr addrspace(5) %"42", align 4
%"56" = load i32, ptr addrspace(5) %"43", align 4
%2 = call i32 @llvm.fshr.i32(i32 %"55", i32 %"54", i32 %"56")
%3 = icmp uge i32 %"56", 32
%"62" = select i1 %3, i32 %"55", i32 %2
store i32 %"62", ptr addrspace(5) %"44", align 4
%"57" = load i64, ptr addrspace(5) %"40", align 8
%"58" = load i32, ptr addrspace(5) %"44", align 4
%"63" = inttoptr i64 %"57" to ptr
store i32 %"58", ptr %"63", align 4
ret void
}
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.fshr.i32(i32, i32, i32) #1
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }

View File

@ -0,0 +1,48 @@
define amdgpu_kernel void @shf_r_wrap(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 {
%"39" = alloca i64, align 8, addrspace(5)
%"40" = alloca i64, align 8, addrspace(5)
%"41" = alloca i32, align 4, addrspace(5)
%"42" = alloca i32, align 4, addrspace(5)
%"43" = alloca i32, align 4, addrspace(5)
%"44" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"36"
"36": ; preds = %1
%"45" = load i64, ptr addrspace(4) %"37", align 8
store i64 %"45", ptr addrspace(5) %"39", align 8
%"46" = load i64, ptr addrspace(4) %"38", align 8
store i64 %"46", ptr addrspace(5) %"40", align 8
%"48" = load i64, ptr addrspace(5) %"39", align 8
%"59" = inttoptr i64 %"48" to ptr
%"47" = load i32, ptr %"59", align 4
store i32 %"47", ptr addrspace(5) %"41", align 4
%"49" = load i64, ptr addrspace(5) %"39", align 8
%"60" = inttoptr i64 %"49" to ptr
%"33" = getelementptr inbounds i8, ptr %"60", i64 4
%"50" = load i32, ptr %"33", align 4
store i32 %"50", ptr addrspace(5) %"42", align 4
%"51" = load i64, ptr addrspace(5) %"39", align 8
%"61" = inttoptr i64 %"51" to ptr
%"35" = getelementptr inbounds i8, ptr %"61", i64 8
%"52" = load i32, ptr %"35", align 4
store i32 %"52", ptr addrspace(5) %"43", align 4
%"54" = load i32, ptr addrspace(5) %"41", align 4
%"55" = load i32, ptr addrspace(5) %"42", align 4
%"56" = load i32, ptr addrspace(5) %"43", align 4
%"62" = call i32 @llvm.fshr.i32(i32 %"55", i32 %"54", i32 %"56")
store i32 %"62", ptr addrspace(5) %"44", align 4
%"57" = load i64, ptr addrspace(5) %"40", align 8
%"58" = load i32, ptr addrspace(5) %"44", align 4
%"63" = inttoptr i64 %"57" to ptr
store i32 %"58", ptr %"63", align 4
ret void
}
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.fshr.i32(i32, i32, i32) #1
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }

View File

@ -302,6 +302,28 @@ test_ptx!(tanh, [f32::INFINITY], [1.0f32]);
test_ptx!(cp_async, [0u32], [1u32, 2u32, 3u32, 0u32]);
test_ptx!(nanosleep, [0u64], [0u64]);
test_ptx!(shf_l, [0x12345678u32, 0x9abcdef0u32, 12], [0xcdef0123u32]);
test_ptx!(shf_r, [0x12345678u32, 0x9abcdef0u32, 12], [0xef012345u32]);
test_ptx!(
shf_l_clamp,
[0x12345678u32, 0x9abcdef0u32, 44],
[0x12345678u32]
);
test_ptx!(
shf_r_clamp,
[0x12345678u32, 0x9abcdef0u32, 44],
[0x9abcdef0u32]
);
test_ptx!(
shf_l_wrap,
[0x12345678u32, 0x9abcdef0u32, 44],
[0xcdef0123u32]
);
test_ptx!(
shf_r_wrap,
[0x12345678u32, 0x9abcdef0u32, 44],
[0xef012345u32]
);
test_ptx!(assertfail);
// TODO: not yet supported

View File

@ -0,0 +1,28 @@
.version 6.5
.target sm_32
.address_size 64
.visible .entry shf_l(
.param .u64 input,
.param .u64 output
)
{
.reg .u64 in_addr;
.reg .u64 out_addr;
.reg .b32 in_a;
.reg .b32 in_b;
.reg .b32 in_c;
.reg .u32 result;
ld.param.u64 in_addr, [input];
ld.param.u64 out_addr, [output];
ld.b32 in_a, [in_addr];
ld.b32 in_b, [in_addr+4];
ld.b32 in_c, [in_addr+8];
shf.l.clamp.b32 result, in_a, in_b, in_c;
st.b32 [out_addr], result;
ret;
}

View File

@ -0,0 +1,28 @@
.version 6.5
.target sm_32
.address_size 64
.visible .entry shf_l_clamp(
.param .u64 input,
.param .u64 output
)
{
.reg .u64 in_addr;
.reg .u64 out_addr;
.reg .b32 in_a;
.reg .b32 in_b;
.reg .b32 in_c;
.reg .u32 result;
ld.param.u64 in_addr, [input];
ld.param.u64 out_addr, [output];
ld.b32 in_a, [in_addr];
ld.b32 in_b, [in_addr+4];
ld.b32 in_c, [in_addr+8];
shf.l.clamp.b32 result, in_a, in_b, in_c;
st.b32 [out_addr], result;
ret;
}

View File

@ -0,0 +1,28 @@
.version 6.5
.target sm_32
.address_size 64
.visible .entry shf_l_wrap(
.param .u64 input,
.param .u64 output
)
{
.reg .u64 in_addr;
.reg .u64 out_addr;
.reg .b32 in_a;
.reg .b32 in_b;
.reg .b32 in_c;
.reg .u32 result;
ld.param.u64 in_addr, [input];
ld.param.u64 out_addr, [output];
ld.b32 in_a, [in_addr];
ld.b32 in_b, [in_addr+4];
ld.b32 in_c, [in_addr+8];
shf.l.wrap.b32 result, in_a, in_b, in_c;
st.b32 [out_addr], result;
ret;
}

View File

@ -0,0 +1,28 @@
.version 6.5
.target sm_32
.address_size 64
.visible .entry shf_r(
.param .u64 input,
.param .u64 output
)
{
.reg .u64 in_addr;
.reg .u64 out_addr;
.reg .b32 in_a;
.reg .b32 in_b;
.reg .b32 in_c;
.reg .u32 result;
ld.param.u64 in_addr, [input];
ld.param.u64 out_addr, [output];
ld.b32 in_a, [in_addr];
ld.b32 in_b, [in_addr+4];
ld.b32 in_c, [in_addr+8];
shf.r.clamp.b32 result, in_a, in_b, in_c;
st.b32 [out_addr], result;
ret;
}

View File

@ -0,0 +1,28 @@
.version 6.5
.target sm_32
.address_size 64
.visible .entry shf_r_clamp(
.param .u64 input,
.param .u64 output
)
{
.reg .u64 in_addr;
.reg .u64 out_addr;
.reg .b32 in_a;
.reg .b32 in_b;
.reg .b32 in_c;
.reg .u32 result;
ld.param.u64 in_addr, [input];
ld.param.u64 out_addr, [output];
ld.b32 in_a, [in_addr];
ld.b32 in_b, [in_addr+4];
ld.b32 in_c, [in_addr+8];
shf.r.clamp.b32 result, in_a, in_b, in_c;
st.b32 [out_addr], result;
ret;
}

View File

@ -0,0 +1,28 @@
.version 6.5
.target sm_32
.address_size 64
.visible .entry shf_r_wrap(
.param .u64 input,
.param .u64 output
)
{
.reg .u64 in_addr;
.reg .u64 out_addr;
.reg .b32 in_a;
.reg .b32 in_b;
.reg .b32 in_c;
.reg .u32 result;
ld.param.u64 in_addr, [input];
ld.param.u64 out_addr, [output];
ld.b32 in_a, [in_addr];
ld.b32 in_b, [in_addr+4];
ld.b32 in_c, [in_addr+8];
shf.r.wrap.b32 result, in_a, in_b, in_c;
st.b32 [out_addr], result;
ret;
}

View File

@ -2,7 +2,9 @@ use super::{
AtomSemantics, MemScope, RawRoundingMode, RawSetpCompareOp, ScalarType, SetpBoolPostOp,
StateSpace, VectorPrefix,
};
use crate::{Mul24Control, PtxError, PtxParserState, Reduction, ShuffleMode};
use crate::{
FunnelShiftMode, Mul24Control, PtxError, PtxParserState, Reduction, ShiftDirection, ShuffleMode,
};
use bitflags::bitflags;
use std::{alloc::Layout, cmp::Ordering, num::NonZeroU8};
@ -549,6 +551,16 @@ ptx_parser_macros::generate_instruction_type!(
src_membermask: T
}
},
Shf {
data: ShfDetails,
type: Type::Scalar(ScalarType::B32),
arguments<T>: {
dst: T,
src_a: T,
src_b: T,
src_c: T
}
},
Shl {
data: ScalarType,
type: { Type::Scalar(data.clone()) },
@ -1103,6 +1115,11 @@ pub struct CpAsyncDetails {
pub src_size: Option<u64>,
}
pub struct ShfDetails {
pub direction: ShiftDirection,
pub mode: FunnelShiftMode,
}
#[derive(Clone)]
pub enum ParsedOperand<Ident> {
Reg(Ident),

View File

@ -1739,6 +1739,12 @@ derive_parser!(
#[derive(Copy, Clone, PartialEq, Eq, Hash)]
pub enum ShuffleMode { }
#[derive(Copy, Clone, PartialEq, Eq, Hash)]
pub enum ShiftDirection { }
#[derive(Copy, Clone, PartialEq, Eq, Hash)]
pub enum FunnelShiftMode { }
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mov
mov{.vec}.type d, a => {
Instruction::Mov {
@ -3625,6 +3631,17 @@ derive_parser!(
cp.async.wait_all => {
Instruction::CpAsyncWaitAll {}
}
// https://docs.nvidia.com/cuda/parallel-thread-execution/#logic-and-shift-instructions-shf
shf.dir.mode.b32 d, a, b, c => {
Instruction::Shf {
data: ShfDetails { direction: dir, mode: mode },
arguments: ShfArgs { dst: d, src_a: a, src_b: b, src_c: c }
}
}
.dir: ShiftDirection = { .l, .r };
.mode: FunnelShiftMode = { .clamp, .wrap };
);
#[cfg(test)]