Merge f146176b5f393b3eb528e8afdddcda86c0c5386f into ce9c0aac236ec712a33b9a53844a011c6303f3b8

2025-08-02 14:57:43 +03:00 · 2025-08-02 06:43:55 +00:00
parent ce9c0aac23 f146176b5f
commit 9080747d6a
18 changed files with 580 additions and 2 deletions
--- a/ptx/src/pass/insert_post_saturation.rs
+++ b/ptx/src/pass/insert_post_saturation.rs
@ -173,6 +173,7 @@ fn run_instruction<'input>(
        | ast::Instruction::Setp { .. }
        | ast::Instruction::SetpBool { .. }
        | ast::Instruction::ShflSync { .. }
+        | ast::Instruction::Shf { .. }
        | ast::Instruction::Shl { .. }
        | ast::Instruction::Shr { .. }
        | ast::Instruction::Sin { .. }
--- a/ptx/src/pass/instruction_mode_to_global_mode/mod.rs
+++ b/ptx/src/pass/instruction_mode_to_global_mode/mod.rs
@ -1823,6 +1823,7 @@ fn get_modes<T: ast::Operand>(inst: &ast::Instruction<T>) -> InstructionModes {
        | ast::Instruction::CpAsyncCommitGroup { .. }
        | ast::Instruction::CpAsyncWaitGroup { .. }
        | ast::Instruction::CpAsyncWaitAll { .. }
+        | ast::Instruction::Shf { .. }
        | ast::Instruction::Shl { .. }
        | ast::Instruction::Selp { .. }
        | ast::Instruction::Ret { .. }
--- a/ptx/src/pass/llvm/emit.rs
+++ b/ptx/src/pass/llvm/emit.rs
@ -34,7 +34,7 @@ use crate::pass::*;
 use llvm_zluda::{core::*, *};
 use llvm_zluda::{prelude::*, LLVMZludaBuildAtomicRMW};
 use llvm_zluda::{LLVMCallConv, LLVMZludaBuildAlloca};
-use ptx_parser::{CpAsyncArgs, CpAsyncDetails, Mul24Control};
+use ptx_parser::{CpAsyncArgs, CpAsyncDetails, FunnelShiftMode, Mul24Control, ShfArgs};

 struct Builder(LLVMBuilderRef);

@ -485,6 +485,7 @@ impl<'a> MethodEmitContext<'a> {
            ast::Instruction::Bra { arguments } => self.emit_bra(arguments),
            ast::Instruction::Call { data, arguments } => self.emit_call(data, arguments),
            ast::Instruction::Cvt { data, arguments } => self.emit_cvt(data, arguments),
+            ast::Instruction::Shf { data, arguments } => self.emit_shf(data, arguments),
            ast::Instruction::Shr { data, arguments } => self.emit_shr(data, arguments),
            ast::Instruction::Shl { data, arguments } => self.emit_shl(data, arguments),
            ast::Instruction::Ret { data } => Ok(self.emit_ret(data)),
@ -1990,6 +1991,61 @@ impl<'a> MethodEmitContext<'a> {
        Ok(())
    }

+    fn emit_shf(
+        &mut self,
+        data: ptx_parser::ShfDetails,
+        arguments: ShfArgs<SpirvWord>,
+    ) -> Result<(), TranslateError> {
+        let lsb = self.resolver.value(arguments.src_a)?;
+        let msb = self.resolver.value(arguments.src_b)?;
+        let shift_amount = self.resolver.value(arguments.src_c)?;
+
+        let llvm_i32 = get_scalar_type(self.context, ast::ScalarType::B32);
+        let const_32 = unsafe { LLVMConstInt(llvm_i32, 32, 0) };
+
+        let intrinsic = match data.direction {
+            ptx_parser::ShiftDirection::R => c"llvm.fshr.i32",
+            ptx_parser::ShiftDirection::L => c"llvm.fshl.i32",
+        };
+
+        let shifted = self.emit_intrinsic(
+            intrinsic,
+            None,
+            Some(&ast::Type::Scalar(ptx_parser::ScalarType::B32)),
+            vec![(msb, llvm_i32), (lsb, llvm_i32), (shift_amount, llvm_i32)],
+        )?;
+
+        if data.mode == FunnelShiftMode::Clamp {
+            // `llvm.fsh*` acts like `shf.*.wrap`. To implement clamp, we must conditionally return
+            // the left or right-most 32 bits if `shift_amount` is greater than or equal to 32.
+
+            let should_clamp = unsafe {
+                LLVMBuildICmp(
+                    self.builder,
+                    LLVMIntPredicate::LLVMIntUGE,
+                    shift_amount,
+                    const_32,
+                    LLVM_UNNAMED.as_ptr(),
+                )
+            };
+
+            let max_shift = match data.direction {
+                ptx_parser::ShiftDirection::R => msb,
+                ptx_parser::ShiftDirection::L => lsb,
+            };
+
+            self.resolver.with_result(arguments.dst, |dst| unsafe {
+                LLVMBuildSelect(self.builder, should_clamp, max_shift, shifted, dst)
+            });
+        } else {
+            let name = self.resolver.get_or_add(arguments.dst);
+            unsafe { LLVMSetValueName2(shifted, name.as_ptr().cast(), name.len()) };
+            self.resolver.register(arguments.dst, shifted);
+        }
+
+        Ok(())
+    }
+
    fn emit_shr(
        &mut self,
        data: ptx_parser::ShrData,
--- a/ptx/src/test/ll/shf_l.ll
+++ b/ptx/src/test/ll/shf_l.ll
@ -0,0 +1,50 @@
+define amdgpu_kernel void @shf_l(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 {
+  %"39" = alloca i64, align 8, addrspace(5)
+  %"40" = alloca i64, align 8, addrspace(5)
+  %"41" = alloca i32, align 4, addrspace(5)
+  %"42" = alloca i32, align 4, addrspace(5)
+  %"43" = alloca i32, align 4, addrspace(5)
+  %"44" = alloca i32, align 4, addrspace(5)
+  br label %1
+
+1:                                                ; preds = %0
+  br label %"36"
+
+"36":                                             ; preds = %1
+  %"45" = load i64, ptr addrspace(4) %"37", align 8
+  store i64 %"45", ptr addrspace(5) %"39", align 8
+  %"46" = load i64, ptr addrspace(4) %"38", align 8
+  store i64 %"46", ptr addrspace(5) %"40", align 8
+  %"48" = load i64, ptr addrspace(5) %"39", align 8
+  %"59" = inttoptr i64 %"48" to ptr
+  %"47" = load i32, ptr %"59", align 4
+  store i32 %"47", ptr addrspace(5) %"41", align 4
+  %"49" = load i64, ptr addrspace(5) %"39", align 8
+  %"60" = inttoptr i64 %"49" to ptr
+  %"33" = getelementptr inbounds i8, ptr %"60", i64 4
+  %"50" = load i32, ptr %"33", align 4
+  store i32 %"50", ptr addrspace(5) %"42", align 4
+  %"51" = load i64, ptr addrspace(5) %"39", align 8
+  %"61" = inttoptr i64 %"51" to ptr
+  %"35" = getelementptr inbounds i8, ptr %"61", i64 8
+  %"52" = load i32, ptr %"35", align 4
+  store i32 %"52", ptr addrspace(5) %"43", align 4
+  %"54" = load i32, ptr addrspace(5) %"41", align 4
+  %"55" = load i32, ptr addrspace(5) %"42", align 4
+  %"56" = load i32, ptr addrspace(5) %"43", align 4
+  %2 = call i32 @llvm.fshl.i32(i32 %"55", i32 %"54", i32 %"56")
+  %3 = icmp uge i32 %"56", 32
+  %"62" = select i1 %3, i32 %"54", i32 %2
+  store i32 %"62", ptr addrspace(5) %"44", align 4
+  %"57" = load i64, ptr addrspace(5) %"40", align 8
+  %"58" = load i32, ptr addrspace(5) %"44", align 4
+  %"63" = inttoptr i64 %"57" to ptr
+  store i32 %"58", ptr %"63", align 4
+  ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare i32 @llvm.fshl.i32(i32, i32, i32) #1
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
+attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
--- a/ptx/src/test/ll/shf_l_clamp.ll
+++ b/ptx/src/test/ll/shf_l_clamp.ll
@ -0,0 +1,50 @@
+define amdgpu_kernel void @shf_l_clamp(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 {
+  %"39" = alloca i64, align 8, addrspace(5)
+  %"40" = alloca i64, align 8, addrspace(5)
+  %"41" = alloca i32, align 4, addrspace(5)
+  %"42" = alloca i32, align 4, addrspace(5)
+  %"43" = alloca i32, align 4, addrspace(5)
+  %"44" = alloca i32, align 4, addrspace(5)
+  br label %1
+
+1:                                                ; preds = %0
+  br label %"36"
+
+"36":                                             ; preds = %1
+  %"45" = load i64, ptr addrspace(4) %"37", align 8
+  store i64 %"45", ptr addrspace(5) %"39", align 8
+  %"46" = load i64, ptr addrspace(4) %"38", align 8
+  store i64 %"46", ptr addrspace(5) %"40", align 8
+  %"48" = load i64, ptr addrspace(5) %"39", align 8
+  %"59" = inttoptr i64 %"48" to ptr
+  %"47" = load i32, ptr %"59", align 4
+  store i32 %"47", ptr addrspace(5) %"41", align 4
+  %"49" = load i64, ptr addrspace(5) %"39", align 8
+  %"60" = inttoptr i64 %"49" to ptr
+  %"33" = getelementptr inbounds i8, ptr %"60", i64 4
+  %"50" = load i32, ptr %"33", align 4
+  store i32 %"50", ptr addrspace(5) %"42", align 4
+  %"51" = load i64, ptr addrspace(5) %"39", align 8
+  %"61" = inttoptr i64 %"51" to ptr
+  %"35" = getelementptr inbounds i8, ptr %"61", i64 8
+  %"52" = load i32, ptr %"35", align 4
+  store i32 %"52", ptr addrspace(5) %"43", align 4
+  %"54" = load i32, ptr addrspace(5) %"41", align 4
+  %"55" = load i32, ptr addrspace(5) %"42", align 4
+  %"56" = load i32, ptr addrspace(5) %"43", align 4
+  %2 = call i32 @llvm.fshl.i32(i32 %"55", i32 %"54", i32 %"56")
+  %3 = icmp uge i32 %"56", 32
+  %"62" = select i1 %3, i32 %"54", i32 %2
+  store i32 %"62", ptr addrspace(5) %"44", align 4
+  %"57" = load i64, ptr addrspace(5) %"40", align 8
+  %"58" = load i32, ptr addrspace(5) %"44", align 4
+  %"63" = inttoptr i64 %"57" to ptr
+  store i32 %"58", ptr %"63", align 4
+  ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare i32 @llvm.fshl.i32(i32, i32, i32) #1
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
+attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
--- a/ptx/src/test/ll/shf_l_wrap.ll
+++ b/ptx/src/test/ll/shf_l_wrap.ll
@ -0,0 +1,48 @@
+define amdgpu_kernel void @shf_l_wrap(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 {
+  %"39" = alloca i64, align 8, addrspace(5)
+  %"40" = alloca i64, align 8, addrspace(5)
+  %"41" = alloca i32, align 4, addrspace(5)
+  %"42" = alloca i32, align 4, addrspace(5)
+  %"43" = alloca i32, align 4, addrspace(5)
+  %"44" = alloca i32, align 4, addrspace(5)
+  br label %1
+
+1:                                                ; preds = %0
+  br label %"36"
+
+"36":                                             ; preds = %1
+  %"45" = load i64, ptr addrspace(4) %"37", align 8
+  store i64 %"45", ptr addrspace(5) %"39", align 8
+  %"46" = load i64, ptr addrspace(4) %"38", align 8
+  store i64 %"46", ptr addrspace(5) %"40", align 8
+  %"48" = load i64, ptr addrspace(5) %"39", align 8
+  %"59" = inttoptr i64 %"48" to ptr
+  %"47" = load i32, ptr %"59", align 4
+  store i32 %"47", ptr addrspace(5) %"41", align 4
+  %"49" = load i64, ptr addrspace(5) %"39", align 8
+  %"60" = inttoptr i64 %"49" to ptr
+  %"33" = getelementptr inbounds i8, ptr %"60", i64 4
+  %"50" = load i32, ptr %"33", align 4
+  store i32 %"50", ptr addrspace(5) %"42", align 4
+  %"51" = load i64, ptr addrspace(5) %"39", align 8
+  %"61" = inttoptr i64 %"51" to ptr
+  %"35" = getelementptr inbounds i8, ptr %"61", i64 8
+  %"52" = load i32, ptr %"35", align 4
+  store i32 %"52", ptr addrspace(5) %"43", align 4
+  %"54" = load i32, ptr addrspace(5) %"41", align 4
+  %"55" = load i32, ptr addrspace(5) %"42", align 4
+  %"56" = load i32, ptr addrspace(5) %"43", align 4
+  %"62" = call i32 @llvm.fshl.i32(i32 %"55", i32 %"54", i32 %"56")
+  store i32 %"62", ptr addrspace(5) %"44", align 4
+  %"57" = load i64, ptr addrspace(5) %"40", align 8
+  %"58" = load i32, ptr addrspace(5) %"44", align 4
+  %"63" = inttoptr i64 %"57" to ptr
+  store i32 %"58", ptr %"63", align 4
+  ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare i32 @llvm.fshl.i32(i32, i32, i32) #1
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
+attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
--- a/ptx/src/test/ll/shf_r.ll
+++ b/ptx/src/test/ll/shf_r.ll
@ -0,0 +1,50 @@
+define amdgpu_kernel void @shf_r(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 {
+  %"39" = alloca i64, align 8, addrspace(5)
+  %"40" = alloca i64, align 8, addrspace(5)
+  %"41" = alloca i32, align 4, addrspace(5)
+  %"42" = alloca i32, align 4, addrspace(5)
+  %"43" = alloca i32, align 4, addrspace(5)
+  %"44" = alloca i32, align 4, addrspace(5)
+  br label %1
+
+1:                                                ; preds = %0
+  br label %"36"
+
+"36":                                             ; preds = %1
+  %"45" = load i64, ptr addrspace(4) %"37", align 8
+  store i64 %"45", ptr addrspace(5) %"39", align 8
+  %"46" = load i64, ptr addrspace(4) %"38", align 8
+  store i64 %"46", ptr addrspace(5) %"40", align 8
+  %"48" = load i64, ptr addrspace(5) %"39", align 8
+  %"59" = inttoptr i64 %"48" to ptr
+  %"47" = load i32, ptr %"59", align 4
+  store i32 %"47", ptr addrspace(5) %"41", align 4
+  %"49" = load i64, ptr addrspace(5) %"39", align 8
+  %"60" = inttoptr i64 %"49" to ptr
+  %"33" = getelementptr inbounds i8, ptr %"60", i64 4
+  %"50" = load i32, ptr %"33", align 4
+  store i32 %"50", ptr addrspace(5) %"42", align 4
+  %"51" = load i64, ptr addrspace(5) %"39", align 8
+  %"61" = inttoptr i64 %"51" to ptr
+  %"35" = getelementptr inbounds i8, ptr %"61", i64 8
+  %"52" = load i32, ptr %"35", align 4
+  store i32 %"52", ptr addrspace(5) %"43", align 4
+  %"54" = load i32, ptr addrspace(5) %"41", align 4
+  %"55" = load i32, ptr addrspace(5) %"42", align 4
+  %"56" = load i32, ptr addrspace(5) %"43", align 4
+  %2 = call i32 @llvm.fshr.i32(i32 %"55", i32 %"54", i32 %"56")
+  %3 = icmp uge i32 %"56", 32
+  %"62" = select i1 %3, i32 %"55", i32 %2
+  store i32 %"62", ptr addrspace(5) %"44", align 4
+  %"57" = load i64, ptr addrspace(5) %"40", align 8
+  %"58" = load i32, ptr addrspace(5) %"44", align 4
+  %"63" = inttoptr i64 %"57" to ptr
+  store i32 %"58", ptr %"63", align 4
+  ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare i32 @llvm.fshr.i32(i32, i32, i32) #1
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
+attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
--- a/ptx/src/test/ll/shf_r_clamp.ll
+++ b/ptx/src/test/ll/shf_r_clamp.ll
@ -0,0 +1,50 @@
+define amdgpu_kernel void @shf_r_clamp(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 {
+  %"39" = alloca i64, align 8, addrspace(5)
+  %"40" = alloca i64, align 8, addrspace(5)
+  %"41" = alloca i32, align 4, addrspace(5)
+  %"42" = alloca i32, align 4, addrspace(5)
+  %"43" = alloca i32, align 4, addrspace(5)
+  %"44" = alloca i32, align 4, addrspace(5)
+  br label %1
+
+1:                                                ; preds = %0
+  br label %"36"
+
+"36":                                             ; preds = %1
+  %"45" = load i64, ptr addrspace(4) %"37", align 8
+  store i64 %"45", ptr addrspace(5) %"39", align 8
+  %"46" = load i64, ptr addrspace(4) %"38", align 8
+  store i64 %"46", ptr addrspace(5) %"40", align 8
+  %"48" = load i64, ptr addrspace(5) %"39", align 8
+  %"59" = inttoptr i64 %"48" to ptr
+  %"47" = load i32, ptr %"59", align 4
+  store i32 %"47", ptr addrspace(5) %"41", align 4
+  %"49" = load i64, ptr addrspace(5) %"39", align 8
+  %"60" = inttoptr i64 %"49" to ptr
+  %"33" = getelementptr inbounds i8, ptr %"60", i64 4
+  %"50" = load i32, ptr %"33", align 4
+  store i32 %"50", ptr addrspace(5) %"42", align 4
+  %"51" = load i64, ptr addrspace(5) %"39", align 8
+  %"61" = inttoptr i64 %"51" to ptr
+  %"35" = getelementptr inbounds i8, ptr %"61", i64 8
+  %"52" = load i32, ptr %"35", align 4
+  store i32 %"52", ptr addrspace(5) %"43", align 4
+  %"54" = load i32, ptr addrspace(5) %"41", align 4
+  %"55" = load i32, ptr addrspace(5) %"42", align 4
+  %"56" = load i32, ptr addrspace(5) %"43", align 4
+  %2 = call i32 @llvm.fshr.i32(i32 %"55", i32 %"54", i32 %"56")
+  %3 = icmp uge i32 %"56", 32
+  %"62" = select i1 %3, i32 %"55", i32 %2
+  store i32 %"62", ptr addrspace(5) %"44", align 4
+  %"57" = load i64, ptr addrspace(5) %"40", align 8
+  %"58" = load i32, ptr addrspace(5) %"44", align 4
+  %"63" = inttoptr i64 %"57" to ptr
+  store i32 %"58", ptr %"63", align 4
+  ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare i32 @llvm.fshr.i32(i32, i32, i32) #1
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
+attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
--- a/ptx/src/test/ll/shf_r_wrap.ll
+++ b/ptx/src/test/ll/shf_r_wrap.ll
@ -0,0 +1,48 @@
+define amdgpu_kernel void @shf_r_wrap(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 {
+  %"39" = alloca i64, align 8, addrspace(5)
+  %"40" = alloca i64, align 8, addrspace(5)
+  %"41" = alloca i32, align 4, addrspace(5)
+  %"42" = alloca i32, align 4, addrspace(5)
+  %"43" = alloca i32, align 4, addrspace(5)
+  %"44" = alloca i32, align 4, addrspace(5)
+  br label %1
+
+1:                                                ; preds = %0
+  br label %"36"
+
+"36":                                             ; preds = %1
+  %"45" = load i64, ptr addrspace(4) %"37", align 8
+  store i64 %"45", ptr addrspace(5) %"39", align 8
+  %"46" = load i64, ptr addrspace(4) %"38", align 8
+  store i64 %"46", ptr addrspace(5) %"40", align 8
+  %"48" = load i64, ptr addrspace(5) %"39", align 8
+  %"59" = inttoptr i64 %"48" to ptr
+  %"47" = load i32, ptr %"59", align 4
+  store i32 %"47", ptr addrspace(5) %"41", align 4
+  %"49" = load i64, ptr addrspace(5) %"39", align 8
+  %"60" = inttoptr i64 %"49" to ptr
+  %"33" = getelementptr inbounds i8, ptr %"60", i64 4
+  %"50" = load i32, ptr %"33", align 4
+  store i32 %"50", ptr addrspace(5) %"42", align 4
+  %"51" = load i64, ptr addrspace(5) %"39", align 8
+  %"61" = inttoptr i64 %"51" to ptr
+  %"35" = getelementptr inbounds i8, ptr %"61", i64 8
+  %"52" = load i32, ptr %"35", align 4
+  store i32 %"52", ptr addrspace(5) %"43", align 4
+  %"54" = load i32, ptr addrspace(5) %"41", align 4
+  %"55" = load i32, ptr addrspace(5) %"42", align 4
+  %"56" = load i32, ptr addrspace(5) %"43", align 4
+  %"62" = call i32 @llvm.fshr.i32(i32 %"55", i32 %"54", i32 %"56")
+  store i32 %"62", ptr addrspace(5) %"44", align 4
+  %"57" = load i64, ptr addrspace(5) %"40", align 8
+  %"58" = load i32, ptr addrspace(5) %"44", align 4
+  %"63" = inttoptr i64 %"57" to ptr
+  store i32 %"58", ptr %"63", align 4
+  ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare i32 @llvm.fshr.i32(i32, i32, i32) #1
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
+attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
--- a/ptx/src/test/spirv_run/mod.rs
+++ b/ptx/src/test/spirv_run/mod.rs
@ -302,6 +302,28 @@ test_ptx!(tanh, [f32::INFINITY], [1.0f32]);
 test_ptx!(cp_async, [0u32], [1u32, 2u32, 3u32, 0u32]);

 test_ptx!(nanosleep, [0u64], [0u64]);
+test_ptx!(shf_l, [0x12345678u32, 0x9abcdef0u32, 12], [0xcdef0123u32]);
+test_ptx!(shf_r, [0x12345678u32, 0x9abcdef0u32, 12], [0xef012345u32]);
+test_ptx!(
+    shf_l_clamp,
+    [0x12345678u32, 0x9abcdef0u32, 44],
+    [0x12345678u32]
+);
+test_ptx!(
+    shf_r_clamp,
+    [0x12345678u32, 0x9abcdef0u32, 44],
+    [0x9abcdef0u32]
+);
+test_ptx!(
+    shf_l_wrap,
+    [0x12345678u32, 0x9abcdef0u32, 44],
+    [0xcdef0123u32]
+);
+test_ptx!(
+    shf_r_wrap,
+    [0x12345678u32, 0x9abcdef0u32, 44],
+    [0xef012345u32]
+);

 test_ptx!(assertfail);
 // TODO: not yet supported
--- a/ptx/src/test/spirv_run/shf_l.ptx
+++ b/ptx/src/test/spirv_run/shf_l.ptx
@ -0,0 +1,28 @@
+.version 6.5
+.target sm_32
+.address_size 64
+
+.visible .entry shf_l(
+    .param .u64 input,
+    .param .u64 output
+)
+{
+    .reg .u64       in_addr;
+    .reg .u64       out_addr;
+    .reg .b32       in_a;
+    .reg .b32       in_b;
+    .reg .b32       in_c;
+    .reg .u32       result;
+
+    ld.param.u64    in_addr, [input];
+    ld.param.u64    out_addr, [output];
+
+    ld.b32          in_a, [in_addr];
+    ld.b32          in_b, [in_addr+4];
+    ld.b32          in_c, [in_addr+8];
+
+    shf.l.clamp.b32 result, in_a, in_b, in_c;
+
+    st.b32          [out_addr], result;
+    ret;
+}
--- a/ptx/src/test/spirv_run/shf_l_clamp.ptx
+++ b/ptx/src/test/spirv_run/shf_l_clamp.ptx
@ -0,0 +1,28 @@
+.version 6.5
+.target sm_32
+.address_size 64
+
+.visible .entry shf_l_clamp(
+    .param .u64 input,
+    .param .u64 output
+)
+{
+    .reg .u64       in_addr;
+    .reg .u64       out_addr;
+    .reg .b32       in_a;
+    .reg .b32       in_b;
+    .reg .b32       in_c;
+    .reg .u32       result;
+
+    ld.param.u64    in_addr, [input];
+    ld.param.u64    out_addr, [output];
+
+    ld.b32          in_a, [in_addr];
+    ld.b32          in_b, [in_addr+4];
+    ld.b32          in_c, [in_addr+8];
+
+    shf.l.clamp.b32 result, in_a, in_b, in_c;
+
+    st.b32          [out_addr], result;
+    ret;
+}
--- a/ptx/src/test/spirv_run/shf_l_wrap.ptx
+++ b/ptx/src/test/spirv_run/shf_l_wrap.ptx
@ -0,0 +1,28 @@
+.version 6.5
+.target sm_32
+.address_size 64
+
+.visible .entry shf_l_wrap(
+    .param .u64 input,
+    .param .u64 output
+)
+{
+    .reg .u64       in_addr;
+    .reg .u64       out_addr;
+    .reg .b32       in_a;
+    .reg .b32       in_b;
+    .reg .b32       in_c;
+    .reg .u32       result;
+
+    ld.param.u64    in_addr, [input];
+    ld.param.u64    out_addr, [output];
+
+    ld.b32          in_a, [in_addr];
+    ld.b32          in_b, [in_addr+4];
+    ld.b32          in_c, [in_addr+8];
+
+    shf.l.wrap.b32 result, in_a, in_b, in_c;
+
+    st.b32          [out_addr], result;
+    ret;
+}
--- a/ptx/src/test/spirv_run/shf_r.ptx
+++ b/ptx/src/test/spirv_run/shf_r.ptx
@ -0,0 +1,28 @@
+.version 6.5
+.target sm_32
+.address_size 64
+
+.visible .entry shf_r(
+    .param .u64 input,
+    .param .u64 output
+)
+{
+    .reg .u64       in_addr;
+    .reg .u64       out_addr;
+    .reg .b32       in_a;
+    .reg .b32       in_b;
+    .reg .b32       in_c;
+    .reg .u32       result;
+
+    ld.param.u64    in_addr, [input];
+    ld.param.u64    out_addr, [output];
+
+    ld.b32          in_a, [in_addr];
+    ld.b32          in_b, [in_addr+4];
+    ld.b32          in_c, [in_addr+8];
+
+    shf.r.clamp.b32 result, in_a, in_b, in_c;
+
+    st.b32          [out_addr], result;
+    ret;
+}
--- a/ptx/src/test/spirv_run/shf_r_clamp.ptx
+++ b/ptx/src/test/spirv_run/shf_r_clamp.ptx
@ -0,0 +1,28 @@
+.version 6.5
+.target sm_32
+.address_size 64
+
+.visible .entry shf_r_clamp(
+    .param .u64 input,
+    .param .u64 output
+)
+{
+    .reg .u64       in_addr;
+    .reg .u64       out_addr;
+    .reg .b32       in_a;
+    .reg .b32       in_b;
+    .reg .b32       in_c;
+    .reg .u32       result;
+
+    ld.param.u64    in_addr, [input];
+    ld.param.u64    out_addr, [output];
+
+    ld.b32          in_a, [in_addr];
+    ld.b32          in_b, [in_addr+4];
+    ld.b32          in_c, [in_addr+8];
+
+    shf.r.clamp.b32 result, in_a, in_b, in_c;
+
+    st.b32          [out_addr], result;
+    ret;
+}
--- a/ptx/src/test/spirv_run/shf_r_wrap.ptx
+++ b/ptx/src/test/spirv_run/shf_r_wrap.ptx
@ -0,0 +1,28 @@
+.version 6.5
+.target sm_32
+.address_size 64
+
+.visible .entry shf_r_wrap(
+    .param .u64 input,
+    .param .u64 output
+)
+{
+    .reg .u64       in_addr;
+    .reg .u64       out_addr;
+    .reg .b32       in_a;
+    .reg .b32       in_b;
+    .reg .b32       in_c;
+    .reg .u32       result;
+
+    ld.param.u64    in_addr, [input];
+    ld.param.u64    out_addr, [output];
+
+    ld.b32          in_a, [in_addr];
+    ld.b32          in_b, [in_addr+4];
+    ld.b32          in_c, [in_addr+8];
+
+    shf.r.wrap.b32 result, in_a, in_b, in_c;
+
+    st.b32          [out_addr], result;
+    ret;
+}
--- a/ptx_parser/src/ast.rs
+++ b/ptx_parser/src/ast.rs
@ -2,7 +2,9 @@ use super::{
    AtomSemantics, MemScope, RawRoundingMode, RawSetpCompareOp, ScalarType, SetpBoolPostOp,
    StateSpace, VectorPrefix,
 };
-use crate::{Mul24Control, PtxError, PtxParserState, Reduction, ShuffleMode};
+use crate::{
+    FunnelShiftMode, Mul24Control, PtxError, PtxParserState, Reduction, ShiftDirection, ShuffleMode,
+};
 use bitflags::bitflags;
 use std::{alloc::Layout, cmp::Ordering, num::NonZeroU8};

@ -549,6 +551,16 @@ ptx_parser_macros::generate_instruction_type!(
                src_membermask: T
            }
        },
+        Shf {
+            data: ShfDetails,
+            type: Type::Scalar(ScalarType::B32),
+            arguments<T>: {
+                dst: T,
+                src_a: T,
+                src_b: T,
+                src_c: T
+            }
+        },
        Shl {
            data: ScalarType,
            type: { Type::Scalar(data.clone()) },
@ -1103,6 +1115,11 @@ pub struct CpAsyncDetails {
    pub src_size: Option<u64>,
 }

+pub struct ShfDetails {
+    pub direction: ShiftDirection,
+    pub mode: FunnelShiftMode,
+}
+
 #[derive(Clone)]
 pub enum ParsedOperand<Ident> {
    Reg(Ident),
--- a/ptx_parser/src/lib.rs
+++ b/ptx_parser/src/lib.rs
@ -1739,6 +1739,12 @@ derive_parser!(
    #[derive(Copy, Clone, PartialEq, Eq, Hash)]
    pub enum ShuffleMode { }

+    #[derive(Copy, Clone, PartialEq, Eq, Hash)]
+    pub enum ShiftDirection { }
+
+    #[derive(Copy, Clone, PartialEq, Eq, Hash)]
+    pub enum FunnelShiftMode { }
+
    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mov
    mov{.vec}.type  d, a => {
        Instruction::Mov {
@ -3625,6 +3631,17 @@ derive_parser!(
    cp.async.wait_all => {
        Instruction::CpAsyncWaitAll {}
    }
+
+    // https://docs.nvidia.com/cuda/parallel-thread-execution/#logic-and-shift-instructions-shf
+    shf.dir.mode.b32  d, a, b, c => {
+        Instruction::Shf {
+            data: ShfDetails { direction: dir, mode: mode },
+            arguments: ShfArgs { dst: d, src_a: a, src_b: b, src_c: c }
+        }
+    }
+
+    .dir: ShiftDirection = { .l, .r };
+    .mode: FunnelShiftMode = { .clamp, .wrap };
 );

 #[cfg(test)]