Add support for shfl.sync.MODE.b32 (#409)

2025-07-20 10:46:21 +03:00 · 2025-07-16 17:23:11 -07:00
parent 36f0ba9cbb
commit dc69808e54
20 changed files with 623 additions and 4 deletions
--- a/.gitignore
+++ b/.gitignore
@ -3,3 +3,5 @@ Cargo.lock
 .vscode/
 .idea/
 ptx/lib/zluda_ptx_impl.ll
--- a/ptx/lib/zluda_ptx_impl.bc
+++ b/ptx/lib/zluda_ptx_impl.bc
--- a/ptx/lib/zluda_ptx_impl.cpp
+++ b/ptx/lib/zluda_ptx_impl.cpp
@ -4,6 +4,7 @@
 #include <cstddef>
 #include <cstdint>
 #include <hip/amd_detail/amd_device_functions.h>
 #define FUNC(NAME) __device__ __attribute__((retain)) __zluda_ptx_impl_##NAME
@ -170,6 +171,55 @@ extern "C"
    BAR_RED_IMPL(and);
    BAR_RED_IMPL(or);
    // Return type of the shfl_sync_<mode>_b32_pred functions: the shuffled value together with
    // the outcome of the bounds check.
    struct ShflSyncResult {
        // Value obtained from the shuffle.
        uint32_t output;
        // True when the computed source lane passed the bounds check.
        bool in_bounds;
    };
    // shfl.sync opts consists of two values, the warp end ID and the subsection mask.
    //
    // The current warp is partitioned into some number of subsections with a width of w. The
    // subsection mask is 32 - w, and indicates which bits of the lane id are part of the subsection
    // address. For example, if each subsection is 8 lanes wide, the subsection mask will be 24 –
    // 11000 in binary. This indicates that the two most significant bits in the 5-bit lane ID are
    // the subsection address. For example, for a lane ID 13 (0b01101) the address of the beginning
    // of the subsection is 0b01000 (8).
    //
    // The warp end ID is the max lane ID for a specific mode. For the CUDA __shfl_sync
    // intrinsics, it is always 31 for idx, bfly, and down, and 0 for up. This is used for the
    // bounds check.
    //
    // SHFL_SYNC_IMPL(mode, calculate_index, CMP) defines two functions per mode:
    //   * shfl_sync_<mode>_b32_pred returns the shuffled value and whether the source lane was
    //     in bounds. A lane whose source is out of bounds uses its own lane ID as the source.
    //   * shfl_sync_<mode>_b32 returns only the shuffled value.
    // Arguments of the generated functions:
    //   input      - the value this lane contributes to the shuffle
    //   delta      - PTX operand b; only its low five bits are used, as specified by the PTX ISA
    //                (bval[4:0] = b[4:0])
    //   opts       - PTX operand c; bits 12:8 are the subsection mask, bits 4:0 the warp end ID
    //   membermask - accepted for signature compatibility, not used
    // calculate_index is an expression for the source lane; it may refer to self, delta,
    // section_mask and subsection. CMP is the comparison against subsection_end that marks the
    // index as out of bounds.
    #define SHFL_SYNC_IMPL(mode, calculate_index, CMP) \
    ShflSyncResult FUNC(shfl_sync_##mode##_b32_pred)(uint32_t input, int32_t delta, uint32_t opts, uint32_t membermask __attribute__((unused))) \
    { \
        /* PTX only uses the low five bits of the lane operand: bval[4:0] = b[4:0]. */ \
        delta &= 0b11111; \
        int32_t section_mask = (opts >> 8) & 0b11111; \
        int32_t warp_end = opts & 0b11111; \
        int32_t self = (int32_t)__lane_id(); \
        /* First lane of the subsection this lane belongs to. */ \
        int32_t subsection = section_mask & self; \
        /* Limit lane of the subsection, used by the bounds check. */ \
        int32_t subsection_end = subsection | (~section_mask & warp_end); \
        int32_t idx = calculate_index; \
        bool out_of_bounds = idx CMP subsection_end; \
        if (out_of_bounds) { \
            idx = self; \
        } \
        /* ds_bpermute is given the lane index shifted left by two. */ \
        int32_t output = __builtin_amdgcn_ds_bpermute(idx<<2, (int32_t)input); \
        return {(uint32_t)output, !out_of_bounds}; \
    } \
    \
    uint32_t FUNC(shfl_sync_##mode##_b32)(uint32_t input, int32_t delta, uint32_t opts, uint32_t membermask) \
    { \
        return __zluda_ptx_impl_shfl_sync_##mode##_b32_pred(input, delta, opts, membermask).output; \
    }
    // We implement these directly on top of __builtin_amdgcn_ds_bpermute rather than calling the
    // HIP __shfl_sync intrinsics, as those only add an assertion checking that the membermask is
    // used correctly. They do not return the result of the range check, so we must replicate that
    // logic here.
    SHFL_SYNC_IMPL(up,   self - delta,                         <);
    SHFL_SYNC_IMPL(down, self + delta,                         >);
    SHFL_SYNC_IMPL(bfly, self ^ delta,                         >);
    SHFL_SYNC_IMPL(idx,  (delta & ~section_mask) | subsection, >);
    void FUNC(__assertfail)(uint64_t message,
                            uint64_t file,
                            uint32_t line,
--- a/ptx/src/pass/emit_llvm.rs
+++ b/ptx/src/pass/emit_llvm.rs
@ -641,7 +641,8 @@ impl<'a> MethodEmitContext<'a> {
            | ast::Instruction::Bar { .. }
            | ast::Instruction::BarRed { .. }
            | ast::Instruction::Bfi { .. }
-            | ast::Instruction::Activemask { .. } => return Err(error_unreachable()),
+            | ast::Instruction::Activemask { .. }
            | ast::Instruction::ShflSync { .. } => return Err(error_unreachable()),
        }
    }
--- a/ptx/src/pass/insert_post_saturation.rs
+++ b/ptx/src/pass/insert_post_saturation.rs
@ -165,6 +165,7 @@ fn run_instruction<'input>(
        | ast::Instruction::Selp { .. }
        | ast::Instruction::Setp { .. }
        | ast::Instruction::SetpBool { .. }
        | ast::Instruction::ShflSync { .. }
        | ast::Instruction::Shl { .. }
        | ast::Instruction::Shr { .. }
        | ast::Instruction::Sin { .. }
--- a/ptx/src/pass/instruction_mode_to_global_mode/mod.rs
+++ b/ptx/src/pass/instruction_mode_to_global_mode/mod.rs
@ -1800,6 +1800,7 @@ fn get_modes<T: ast::Operand>(inst: &ast::Instruction<T>) -> InstructionModes {
        | ast::Instruction::Bfe { .. }
        | ast::Instruction::Bfi { .. }
        | ast::Instruction::Shr { .. }
        | ast::Instruction::ShflSync { .. }
        | ast::Instruction::Shl { .. }
        | ast::Instruction::Selp { .. }
        | ast::Instruction::Ret { .. }
--- a/ptx/src/pass/replace_instructions_with_function_calls.rs
+++ b/ptx/src/pass/replace_instructions_with_function_calls.rs
@ -118,6 +118,25 @@ fn run_instruction<'input>(
            };
            to_call(resolver, fn_declarations, name.into(), ptx_parser::Instruction::BarRed { data, arguments })?
        }
        // `shfl.sync` is handed to `to_call` under the name `shfl_sync_<mode>_b32[_pred]`.
        ptx_parser::Instruction::ShflSync { data, arguments } => {
            // One function name per shuffle mode.
            let mode = match data.mode {
                ptx_parser::ShuffleMode::Up => "up",
                ptx_parser::ShuffleMode::Down => "down",
                ptx_parser::ShuffleMode::BFly => "bfly",
                ptx_parser::ShuffleMode::Idx => "idx",
            };
            // The `_pred` suffix is used when the instruction has a predicate destination.
            let pred = if arguments.dst_pred.is_some() {
                "_pred"
            } else {
                ""
            };
            to_call(
                resolver,
                fn_declarations,
                // Produces e.g. `shfl_sync_up_b32` or `shfl_sync_up_b32_pred`.
                format!("shfl_sync_{}_b32{}", mode, pred).into(),
                ptx_parser::Instruction::ShflSync { data, arguments },
            )?
        }
        i => i,
    })
 }
--- a/ptx/src/test/ll/shfl_sync_bfly_b32_pred.ll
+++ b/ptx/src/test/ll/shfl_sync_bfly_b32_pred.ll
@ -0,0 +1,59 @@
 declare [2 x i32] @__zluda_ptx_impl_shfl_sync_bfly_b32_pred(i32, i32, i32, i32) #0
 declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
 define amdgpu_kernel void @shfl_sync_bfly_b32_pred(ptr addrspace(4) byref(i64) %"42") #1 {
  %"43" = alloca i64, align 8, addrspace(5)
  %"44" = alloca i64, align 8, addrspace(5)
  %"45" = alloca i32, align 4, addrspace(5)
  %"46" = alloca i32, align 4, addrspace(5)
  %"47" = alloca i1, align 1, addrspace(5)
  br label %1
 1:                                                ; preds = %0
  br label %"39"
 "39":                                             ; preds = %1
  %"48" = load i64, ptr addrspace(4) %"42", align 4
  store i64 %"48", ptr addrspace(5) %"43", align 4
  %"33" = call i32 @__zluda_ptx_impl_sreg_tid(i8 0)
  br label %"40"
 "40":                                             ; preds = %"39"
  store i32 %"33", ptr addrspace(5) %"45", align 4
  %"52" = load i32, ptr addrspace(5) %"45", align 4
  %2 = call [2 x i32] @__zluda_ptx_impl_shfl_sync_bfly_b32_pred(i32 %"52", i32 3, i32 31, i32 -1)
  %"65" = extractvalue [2 x i32] %2, 0
  %3 = extractvalue [2 x i32] %2, 1
  %"51" = trunc i32 %3 to i1
  store i32 %"65", ptr addrspace(5) %"46", align 4
  store i1 %"51", ptr addrspace(5) %"47", align 1
  %"53" = load i1, ptr addrspace(5) %"47", align 1
  br i1 %"53", label %"15", label %"14"
 "14":                                             ; preds = %"40"
  %"55" = load i32, ptr addrspace(5) %"46", align 4
  %"54" = add i32 %"55", 1000
  store i32 %"54", ptr addrspace(5) %"46", align 4
  br label %"15"
 "15":                                             ; preds = %"14", %"40"
  %"57" = load i32, ptr addrspace(5) %"45", align 4
  %"56" = zext i32 %"57" to i64
  store i64 %"56", ptr addrspace(5) %"44", align 4
  %"59" = load i64, ptr addrspace(5) %"44", align 4
  %"58" = mul i64 %"59", 4
  store i64 %"58", ptr addrspace(5) %"44", align 4
  %"61" = load i64, ptr addrspace(5) %"43", align 4
  %"62" = load i64, ptr addrspace(5) %"44", align 4
  %"60" = add i64 %"61", %"62"
  store i64 %"60", ptr addrspace(5) %"43", align 4
  %"63" = load i64, ptr addrspace(5) %"43", align 4
  %"64" = load i32, ptr addrspace(5) %"46", align 4
  %"67" = inttoptr i64 %"63" to ptr
  store i32 %"64", ptr %"67", align 4
  ret void
 }
 attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" }
 attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
--- a/ptx/src/test/ll/shfl_sync_down_b32_pred.ll
+++ b/ptx/src/test/ll/shfl_sync_down_b32_pred.ll
@ -0,0 +1,59 @@
 declare [2 x i32] @__zluda_ptx_impl_shfl_sync_down_b32_pred(i32, i32, i32, i32) #0
 declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
 define amdgpu_kernel void @shfl_sync_down_b32_pred(ptr addrspace(4) byref(i64) %"42") #1 {
  %"43" = alloca i64, align 8, addrspace(5)
  %"44" = alloca i64, align 8, addrspace(5)
  %"45" = alloca i32, align 4, addrspace(5)
  %"46" = alloca i32, align 4, addrspace(5)
  %"47" = alloca i1, align 1, addrspace(5)
  br label %1
 1:                                                ; preds = %0
  br label %"39"
 "39":                                             ; preds = %1
  %"48" = load i64, ptr addrspace(4) %"42", align 4
  store i64 %"48", ptr addrspace(5) %"43", align 4
  %"33" = call i32 @__zluda_ptx_impl_sreg_tid(i8 0)
  br label %"40"
 "40":                                             ; preds = %"39"
  store i32 %"33", ptr addrspace(5) %"45", align 4
  %"52" = load i32, ptr addrspace(5) %"45", align 4
  %2 = call [2 x i32] @__zluda_ptx_impl_shfl_sync_down_b32_pred(i32 %"52", i32 3, i32 31, i32 -1)
  %"65" = extractvalue [2 x i32] %2, 0
  %3 = extractvalue [2 x i32] %2, 1
  %"51" = trunc i32 %3 to i1
  store i32 %"65", ptr addrspace(5) %"46", align 4
  store i1 %"51", ptr addrspace(5) %"47", align 1
  %"53" = load i1, ptr addrspace(5) %"47", align 1
  br i1 %"53", label %"15", label %"14"
 "14":                                             ; preds = %"40"
  %"55" = load i32, ptr addrspace(5) %"46", align 4
  %"54" = add i32 %"55", 1000
  store i32 %"54", ptr addrspace(5) %"46", align 4
  br label %"15"
 "15":                                             ; preds = %"14", %"40"
  %"57" = load i32, ptr addrspace(5) %"45", align 4
  %"56" = zext i32 %"57" to i64
  store i64 %"56", ptr addrspace(5) %"44", align 4
  %"59" = load i64, ptr addrspace(5) %"44", align 4
  %"58" = mul i64 %"59", 4
  store i64 %"58", ptr addrspace(5) %"44", align 4
  %"61" = load i64, ptr addrspace(5) %"43", align 4
  %"62" = load i64, ptr addrspace(5) %"44", align 4
  %"60" = add i64 %"61", %"62"
  store i64 %"60", ptr addrspace(5) %"43", align 4
  %"63" = load i64, ptr addrspace(5) %"43", align 4
  %"64" = load i32, ptr addrspace(5) %"46", align 4
  %"67" = inttoptr i64 %"63" to ptr
  store i32 %"64", ptr %"67", align 4
  ret void
 }
 attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" }
 attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
--- a/ptx/src/test/ll/shfl_sync_idx_b32_pred.ll
+++ b/ptx/src/test/ll/shfl_sync_idx_b32_pred.ll
@ -0,0 +1,59 @@
 declare [2 x i32] @__zluda_ptx_impl_shfl_sync_idx_b32_pred(i32, i32, i32, i32) #0
 declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
 define amdgpu_kernel void @shfl_sync_idx_b32_pred(ptr addrspace(4) byref(i64) %"42") #1 {
  %"43" = alloca i64, align 8, addrspace(5)
  %"44" = alloca i64, align 8, addrspace(5)
  %"45" = alloca i32, align 4, addrspace(5)
  %"46" = alloca i32, align 4, addrspace(5)
  %"47" = alloca i1, align 1, addrspace(5)
  br label %1
 1:                                                ; preds = %0
  br label %"39"
 "39":                                             ; preds = %1
  %"48" = load i64, ptr addrspace(4) %"42", align 4
  store i64 %"48", ptr addrspace(5) %"43", align 4
  %"33" = call i32 @__zluda_ptx_impl_sreg_tid(i8 0)
  br label %"40"
 "40":                                             ; preds = %"39"
  store i32 %"33", ptr addrspace(5) %"45", align 4
  %"52" = load i32, ptr addrspace(5) %"45", align 4
  %2 = call [2 x i32] @__zluda_ptx_impl_shfl_sync_idx_b32_pred(i32 %"52", i32 12, i32 31, i32 -1)
  %"65" = extractvalue [2 x i32] %2, 0
  %3 = extractvalue [2 x i32] %2, 1
  %"51" = trunc i32 %3 to i1
  store i32 %"65", ptr addrspace(5) %"46", align 4
  store i1 %"51", ptr addrspace(5) %"47", align 1
  %"53" = load i1, ptr addrspace(5) %"47", align 1
  br i1 %"53", label %"15", label %"14"
 "14":                                             ; preds = %"40"
  %"55" = load i32, ptr addrspace(5) %"46", align 4
  %"54" = add i32 %"55", 1000
  store i32 %"54", ptr addrspace(5) %"46", align 4
  br label %"15"
 "15":                                             ; preds = %"14", %"40"
  %"57" = load i32, ptr addrspace(5) %"45", align 4
  %"56" = zext i32 %"57" to i64
  store i64 %"56", ptr addrspace(5) %"44", align 4
  %"59" = load i64, ptr addrspace(5) %"44", align 4
  %"58" = mul i64 %"59", 4
  store i64 %"58", ptr addrspace(5) %"44", align 4
  %"61" = load i64, ptr addrspace(5) %"43", align 4
  %"62" = load i64, ptr addrspace(5) %"44", align 4
  %"60" = add i64 %"61", %"62"
  store i64 %"60", ptr addrspace(5) %"43", align 4
  %"63" = load i64, ptr addrspace(5) %"43", align 4
  %"64" = load i32, ptr addrspace(5) %"46", align 4
  %"67" = inttoptr i64 %"63" to ptr
  store i32 %"64", ptr %"67", align 4
  ret void
 }
 attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" }
 attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
--- a/ptx/src/test/ll/shfl_sync_mode_b32.ll
+++ b/ptx/src/test/ll/shfl_sync_mode_b32.ll
@ -0,0 +1,74 @@
 declare i32 @__zluda_ptx_impl_shfl_sync_down_b32(i32, i32, i32, i32) #0
 declare i32 @__zluda_ptx_impl_shfl_sync_up_b32(i32, i32, i32, i32) #0
 declare i32 @__zluda_ptx_impl_shfl_sync_bfly_b32(i32, i32, i32, i32) #0
 declare i32 @__zluda_ptx_impl_shfl_sync_idx_b32(i32, i32, i32, i32) #0
 declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
 define amdgpu_kernel void @shfl_sync_mode_b32(ptr addrspace(4) byref(i64) %"48") #1 {
  %"49" = alloca i64, align 8, addrspace(5)
  %"50" = alloca i64, align 8, addrspace(5)
  %"51" = alloca i32, align 4, addrspace(5)
  %"52" = alloca i32, align 4, addrspace(5)
  %"53" = alloca i32, align 4, addrspace(5)
  br label %1
 1:                                                ; preds = %0
  br label %"45"
 "45":                                             ; preds = %1
  %"54" = load i64, ptr addrspace(4) %"48", align 4
  store i64 %"54", ptr addrspace(5) %"49", align 4
  %"31" = call i32 @__zluda_ptx_impl_sreg_tid(i8 0)
  br label %"46"
 "46":                                             ; preds = %"45"
  store i32 %"31", ptr addrspace(5) %"51", align 4
  %"57" = load i32, ptr addrspace(5) %"51", align 4
  %"84" = call i32 @__zluda_ptx_impl_shfl_sync_up_b32(i32 %"57", i32 3, i32 7680, i32 -1)
  store i32 %"84", ptr addrspace(5) %"52", align 4
  %"59" = load i32, ptr addrspace(5) %"52", align 4
  store i32 %"59", ptr addrspace(5) %"53", align 4
  %"61" = load i32, ptr addrspace(5) %"51", align 4
  %"86" = call i32 @__zluda_ptx_impl_shfl_sync_down_b32(i32 %"61", i32 3, i32 7199, i32 -1)
  store i32 %"86", ptr addrspace(5) %"52", align 4
  %"63" = load i32, ptr addrspace(5) %"53", align 4
  %"64" = load i32, ptr addrspace(5) %"52", align 4
  %"62" = add i32 %"63", %"64"
  store i32 %"62", ptr addrspace(5) %"53", align 4
  %"66" = load i32, ptr addrspace(5) %"51", align 4
  %"88" = call i32 @__zluda_ptx_impl_shfl_sync_bfly_b32(i32 %"66", i32 3, i32 6175, i32 -1)
  store i32 %"88", ptr addrspace(5) %"52", align 4
  %"68" = load i32, ptr addrspace(5) %"53", align 4
  %"69" = load i32, ptr addrspace(5) %"52", align 4
  %"67" = add i32 %"68", %"69"
  store i32 %"67", ptr addrspace(5) %"53", align 4
  %"71" = load i32, ptr addrspace(5) %"51", align 4
  %"90" = call i32 @__zluda_ptx_impl_shfl_sync_idx_b32(i32 %"71", i32 3, i32 4127, i32 -1)
  store i32 %"90", ptr addrspace(5) %"52", align 4
  %"73" = load i32, ptr addrspace(5) %"53", align 4
  %"74" = load i32, ptr addrspace(5) %"52", align 4
  %"72" = add i32 %"73", %"74"
  store i32 %"72", ptr addrspace(5) %"53", align 4
  %"76" = load i32, ptr addrspace(5) %"51", align 4
  %"75" = zext i32 %"76" to i64
  store i64 %"75", ptr addrspace(5) %"50", align 4
  %"78" = load i64, ptr addrspace(5) %"50", align 4
  %"77" = mul i64 %"78", 4
  store i64 %"77", ptr addrspace(5) %"50", align 4
  %"80" = load i64, ptr addrspace(5) %"49", align 4
  %"81" = load i64, ptr addrspace(5) %"50", align 4
  %"79" = add i64 %"80", %"81"
  store i64 %"79", ptr addrspace(5) %"49", align 4
  %"82" = load i64, ptr addrspace(5) %"49", align 4
  %"83" = load i32, ptr addrspace(5) %"53", align 4
  %"92" = inttoptr i64 %"82" to ptr
  store i32 %"83", ptr %"92", align 4
  ret void
 }
 attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" }
 attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
--- a/ptx/src/test/ll/shfl_sync_up_b32_pred.ll
+++ b/ptx/src/test/ll/shfl_sync_up_b32_pred.ll
@ -0,0 +1,59 @@
 declare [2 x i32] @__zluda_ptx_impl_shfl_sync_up_b32_pred(i32, i32, i32, i32) #0
 declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
 define amdgpu_kernel void @shfl_sync_up_b32_pred(ptr addrspace(4) byref(i64) %"42") #1 {
  %"43" = alloca i64, align 8, addrspace(5)
  %"44" = alloca i64, align 8, addrspace(5)
  %"45" = alloca i32, align 4, addrspace(5)
  %"46" = alloca i32, align 4, addrspace(5)
  %"47" = alloca i1, align 1, addrspace(5)
  br label %1
 1:                                                ; preds = %0
  br label %"39"
 "39":                                             ; preds = %1
  %"48" = load i64, ptr addrspace(4) %"42", align 4
  store i64 %"48", ptr addrspace(5) %"43", align 4
  %"33" = call i32 @__zluda_ptx_impl_sreg_tid(i8 0)
  br label %"40"
 "40":                                             ; preds = %"39"
  store i32 %"33", ptr addrspace(5) %"45", align 4
  %"52" = load i32, ptr addrspace(5) %"45", align 4
  %2 = call [2 x i32] @__zluda_ptx_impl_shfl_sync_up_b32_pred(i32 %"52", i32 3, i32 0, i32 -1)
  %"65" = extractvalue [2 x i32] %2, 0
  %3 = extractvalue [2 x i32] %2, 1
  %"51" = trunc i32 %3 to i1
  store i32 %"65", ptr addrspace(5) %"46", align 4
  store i1 %"51", ptr addrspace(5) %"47", align 1
  %"53" = load i1, ptr addrspace(5) %"47", align 1
  br i1 %"53", label %"15", label %"14"
 "14":                                             ; preds = %"40"
  %"55" = load i32, ptr addrspace(5) %"46", align 4
  %"54" = add i32 %"55", 1000
  store i32 %"54", ptr addrspace(5) %"46", align 4
  br label %"15"
 "15":                                             ; preds = %"14", %"40"
  %"57" = load i32, ptr addrspace(5) %"45", align 4
  %"56" = zext i32 %"57" to i64
  store i64 %"56", ptr addrspace(5) %"44", align 4
  %"59" = load i64, ptr addrspace(5) %"44", align 4
  %"58" = mul i64 %"59", 4
  store i64 %"58", ptr addrspace(5) %"44", align 4
  %"61" = load i64, ptr addrspace(5) %"43", align 4
  %"62" = load i64, ptr addrspace(5) %"44", align 4
  %"60" = add i64 %"61", %"62"
  store i64 %"60", ptr addrspace(5) %"43", align 4
  %"63" = load i64, ptr addrspace(5) %"43", align 4
  %"64" = load i32, ptr addrspace(5) %"46", align 4
  %"67" = inttoptr i64 %"63" to ptr
  store i32 %"64", ptr %"67", align 4
  ret void
 }
 attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" }
 attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
--- a/ptx/src/test/spirv_run/mod.rs
+++ b/ptx/src/test/spirv_run/mod.rs
@ -315,6 +315,36 @@ test_ptx_warp!(bar_red_and_pred, [
    2u32, 2u32, 2u32, 2u32, 2u32, 2u32, 2u32, 2u32, 2u32, 2u32, 2u32, 2u32, 2u32, 2u32, 2u32, 2u32,
    2u32, 2u32, 2u32, 2u32, 2u32, 2u32, 2u32, 2u32, 2u32, 2u32, 2u32, 2u32, 2u32, 2u32, 2u32, 2u32,
 ]);
 // shfl.sync.up with delta 3 over the full warp: each thread reads the thread id three lanes
 // below it. The first three lanes of every 32-thread group have no such lane, so they keep
 // their own id and the kernel adds 1000 because the predicate is false.
 test_ptx_warp!(shfl_sync_up_b32_pred, [
    1000u32, 1001u32, 1002u32,    0u32,    1u32,    2u32,    3u32,    4u32,    5u32,    6u32,    7u32,    8u32,    9u32,   10u32,   11u32,   12u32,
      13u32,   14u32,   15u32,   16u32,   17u32,   18u32,   19u32,   20u32,   21u32,   22u32,   23u32,   24u32,   25u32,   26u32,   27u32,   28u32,
    1032u32, 1033u32, 1034u32,   32u32,   33u32,   34u32,   35u32,   36u32,   37u32,   38u32,   39u32,   40u32,   41u32,   42u32,   43u32,   44u32,
      45u32,   46u32,   47u32,   48u32,   49u32,   50u32,   51u32,   52u32,   53u32,   54u32,   55u32,   56u32,   57u32,   58u32,   59u32,   60u32,
 ]);
 // shfl.sync.down with delta 3 over the full warp: each thread reads the thread id three lanes
 // above it. The last three lanes of every 32-thread group would read past lane 31, so they
 // keep their own id and the kernel adds 1000 because the predicate is false.
 test_ptx_warp!(shfl_sync_down_b32_pred, [
     3u32,    4u32,    5u32,    6u32,    7u32,    8u32,    9u32,   10u32,   11u32,   12u32,   13u32,   14u32,   15u32,   16u32,   17u32,   18u32,
    19u32,   20u32,   21u32,   22u32,   23u32,   24u32,   25u32,   26u32,   27u32,   28u32,   29u32,   30u32,   31u32, 1029u32, 1030u32, 1031u32,
    35u32,   36u32,   37u32,   38u32,   39u32,   40u32,   41u32,   42u32,   43u32,   44u32,   45u32,   46u32,   47u32,   48u32,   49u32,   50u32,
    51u32,   52u32,   53u32,   54u32,   55u32,   56u32,   57u32,   58u32,   59u32,   60u32,   61u32,   62u32,   63u32, 1061u32, 1062u32, 1063u32,
 ]);
 // shfl.sync.bfly with lane mask 3 over the full warp: each thread reads the thread id of
 // lane ^ 3, which reverses every group of four lanes. No lane is out of range, so 1000 is
 // never added.
 test_ptx_warp!(shfl_sync_bfly_b32_pred, [
     3u32,  2u32,  1u32,  0u32,  7u32,  6u32,  5u32,  4u32, 11u32, 10u32,  9u32,  8u32, 15u32, 14u32, 13u32, 12u32,
    19u32, 18u32, 17u32, 16u32, 23u32, 22u32, 21u32, 20u32, 27u32, 26u32, 25u32, 24u32, 31u32, 30u32, 29u32, 28u32,
    35u32, 34u32, 33u32, 32u32, 39u32, 38u32, 37u32, 36u32, 43u32, 42u32, 41u32, 40u32, 47u32, 46u32, 45u32, 44u32,
    51u32, 50u32, 49u32, 48u32, 55u32, 54u32, 53u32, 52u32, 59u32, 58u32, 57u32, 56u32, 63u32, 62u32, 61u32, 60u32,
 ]);
 // shfl.sync.idx with source lane 12 over the full warp: every thread reads the thread id held
 // by lane 12 of its own 32-thread group (12 for the first group, 44 for the second). No lane
 // is out of range, so 1000 is never added.
 test_ptx_warp!(shfl_sync_idx_b32_pred, [
    12u32, 12u32, 12u32, 12u32, 12u32, 12u32, 12u32, 12u32, 12u32, 12u32, 12u32, 12u32, 12u32, 12u32, 12u32, 12u32,
    12u32, 12u32, 12u32, 12u32, 12u32, 12u32, 12u32, 12u32, 12u32, 12u32, 12u32, 12u32, 12u32, 12u32, 12u32, 12u32,
    44u32, 44u32, 44u32, 44u32, 44u32, 44u32, 44u32, 44u32, 44u32, 44u32, 44u32, 44u32, 44u32, 44u32, 44u32, 44u32,
    44u32, 44u32, 44u32, 44u32, 44u32, 44u32, 44u32, 44u32, 44u32, 44u32, 44u32, 44u32, 44u32, 44u32, 44u32, 44u32,
 ]);
 // Sum of four shuffles of the thread id, without predicate output: up (width 2), down
 // (width 4), bfly (width 8) and idx (width 16), each with 3 as the lane operand. A shuffle
 // whose source lane is out of range contributes the thread's own id.
 // E.g. thread 0: 0 (up, out of range) + 3 (down) + 3 (bfly) + 3 (idx) = 9.
 test_ptx_warp!(shfl_sync_mode_b32, [
      9u32,   7u32,   8u32,   9u32,  21u32,  19u32,  20u32,  21u32,  33u32,  31u32,  32u32,  33u32,  45u32,  43u32,  44u32,  45u32,  
     73u32,  71u32,  72u32,  73u32,  85u32,  83u32,  84u32,  85u32,  97u32,  95u32,  96u32,  97u32, 109u32, 107u32, 108u32, 109u32,
    137u32, 135u32, 136u32, 137u32, 149u32, 147u32, 148u32, 149u32, 161u32, 159u32, 160u32, 161u32, 173u32, 171u32, 172u32, 173u32,
    201u32, 199u32, 200u32, 201u32, 213u32, 211u32, 212u32, 213u32, 225u32, 223u32, 224u32, 225u32, 237u32, 235u32, 236u32, 237u32,
 ]);
 struct DisplayError<T: Debug> {
    err: T,
--- a/ptx/src/test/spirv_run/shfl_sync_bfly_b32_pred.ptx
+++ b/ptx/src/test/spirv_run/shfl_sync_bfly_b32_pred.ptx
@ -0,0 +1,32 @@
 .version 6.5
 .target sm_30
 .address_size 64
 .visible .entry shfl_sync_bfly_b32_pred(
    .param .u64 output
 )
 {
    .reg .u64 	      out_addr;
    .reg .u64 	      out_index;
    .reg .u32         thread_id;
    .reg .u32         result;
    .reg .pred        in_range;
    ld.param.u64  	  out_addr, [output];
    mov.u32           thread_id, %tid.x;
    // result = __shfl_xor_sync(mask=0xFFFFFFFF, thread_id, delta=3, width=32)
    // c is ((32-width) << 8) | 31
    shfl.sync.bfly.b32 result|in_range, thread_id, 3, 31, 0xFFFFFFFF;
    @!in_range add.u32 result, result, 1000;
    // Return result
    cvt.u64.u32       out_index, thread_id;
    mul.lo.u64        out_index, out_index, 4;
    add.u64           out_addr, out_addr, out_index;
    st.u32            [out_addr], result;
    ret;
 }
--- a/ptx/src/test/spirv_run/shfl_sync_down_b32_pred.ptx
+++ b/ptx/src/test/spirv_run/shfl_sync_down_b32_pred.ptx
@ -0,0 +1,32 @@
 .version 6.5
 .target sm_30
 .address_size 64
 .visible .entry shfl_sync_down_b32_pred(
    .param .u64 output
 )
 {
    .reg .u64 	      out_addr;
    .reg .u64 	      out_index;
    .reg .u32         thread_id;
    .reg .u32         result;
    .reg .pred        in_range;
    ld.param.u64  	  out_addr, [output];
    mov.u32           thread_id, %tid.x;
    // result = __shfl_down_sync(mask=0xFFFFFFFF, thread_id, delta=3, width=32)
    // c is ((32-width) << 8) | 31
    shfl.sync.down.b32 result|in_range, thread_id, 3, 31, 0xFFFFFFFF;
        @!in_range add.u32 result, result, 1000;
    // Return result
    cvt.u64.u32       out_index, thread_id;
    mul.lo.u64        out_index, out_index, 4;
    add.u64           out_addr, out_addr, out_index;
    st.u32            [out_addr], result;
    ret;
 }
--- a/ptx/src/test/spirv_run/shfl_sync_idx_b32_pred.ptx
+++ b/ptx/src/test/spirv_run/shfl_sync_idx_b32_pred.ptx
@ -0,0 +1,32 @@
 .version 6.5
 .target sm_30
 .address_size 64
 .visible .entry shfl_sync_idx_b32_pred(
    .param .u64 output
 )
 {
    .reg .u64 	      out_addr;
    .reg .u64 	      out_index;
    .reg .u32         thread_id;
    .reg .u32         result;
    .reg .pred        in_range;
    ld.param.u64  	  out_addr, [output];
    mov.u32           thread_id, %tid.x;
    // result = __shfl_sync(mask=0xFFFFFFFF, thread_id, srcLane=12, width=32)
    // c is ((32-width) << 8) | 31
    shfl.sync.idx.b32 result|in_range, thread_id, 12, 31, 0xFFFFFFFF;
    @!in_range add.u32 result, result, 1000;
    // Return result
    cvt.u64.u32       out_index, thread_id;
    mul.lo.u64        out_index, out_index, 4;
    add.u64           out_addr, out_addr, out_index;
    st.u32            [out_addr], result;
    ret;
 }
--- a/ptx/src/test/spirv_run/shfl_sync_mode_b32.ptx
+++ b/ptx/src/test/spirv_run/shfl_sync_mode_b32.ptx
@ -0,0 +1,46 @@
 .version 6.5
 .target sm_30
 .address_size 64
 .visible .entry shfl_sync_mode_b32(
    .param .u64 output
 )
 {
    .reg .u64 	      out_addr;
    .reg .u64 	      out_index;
    .reg .u32         thread_id;
    .reg .u32         temp;
    .reg .u32         result;
    ld.param.u64  	  out_addr, [output];
    mov.u32           thread_id, %tid.x;
    // result = __shfl_up_sync(mask=0xFFFFFFFF, thread_id, delta=3, width=2)
    // c is ((32-width) << 8)
    shfl.sync.up.b32 temp, thread_id, 3, 7680, 0xFFFFFFFF;
    mov.u32 result, temp;
    // result += __shfl_down_sync(mask=0xFFFFFFFF, thread_id, delta=3, width=4)
    // c is ((32-width) << 8) | 31
    shfl.sync.down.b32 temp, thread_id, 3, 7199, 0xFFFFFFFF;
    add.u32 result, result, temp;
    // result += __shfl_xor_sync(mask=0xFFFFFFFF, thread_id, delta=3, width=8)
    // c is ((32-width) << 8) | 31
    shfl.sync.bfly.b32 temp, thread_id, 3, 6175, 0xFFFFFFFF;
    add.u32 result, result, temp;
    // result += __shfl_sync(mask=0xFFFFFFFF, thread_id, srcLane=3, width=16)
    // c is ((32-width) << 8) | 31
    shfl.sync.idx.b32 temp, thread_id, 3, 4127, 0xFFFFFFFF;
    add.u32 result, result, temp;
    // Return result
    cvt.u64.u32       out_index, thread_id;
    mul.lo.u64        out_index, out_index, 4;
    add.u64           out_addr, out_addr, out_index;
    st.u32            [out_addr], result;
    ret;
 }
--- a/ptx/src/test/spirv_run/shfl_sync_up_b32_pred.ptx
+++ b/ptx/src/test/spirv_run/shfl_sync_up_b32_pred.ptx
@ -0,0 +1,32 @@
 .version 6.5
 .target sm_30
 .address_size 64
 .visible .entry shfl_sync_up_b32_pred(
    .param .u64 output
 )
 {
    .reg .u64 	      out_addr;
    .reg .u64 	      out_index;
    .reg .u32         thread_id;
    .reg .u32         result;
    .reg .pred        in_range;
    ld.param.u64  	  out_addr, [output];
    mov.u32           thread_id, %tid.x;
    // result = __shfl_up_sync(mask=0xFFFFFFFF, thread_id, delta=3, width=32)
    // c is ((32-width) << 8)
    shfl.sync.up.b32 result|in_range, thread_id, 3, 0, 0xFFFFFFFF;
    @!in_range add.u32 result, result, 1000;
    // Return result
    cvt.u64.u32       out_index, thread_id;
    mul.lo.u64        out_index, out_index, 4;
    add.u64           out_addr, out_addr, out_index;
    st.u32            [out_addr], result;
    ret;
 }
--- a/ptx_parser/src/ast.rs
+++ b/ptx_parser/src/ast.rs
@ -2,7 +2,7 @@ use super::{
    AtomSemantics, MemScope, RawRoundingMode, RawSetpCompareOp, ScalarType, SetpBoolPostOp,
    StateSpace, VectorPrefix,
 };
-use crate::{Mul24Control, Reduction, PtxError, PtxParserState};
+use crate::{Mul24Control, Reduction, PtxError, PtxParserState, ShuffleMode};
 use bitflags::bitflags;
 use std::{alloc::Layout, cmp::Ordering, num::NonZeroU8};
@ -468,6 +468,21 @@ ptx_parser_macros::generate_instruction_type!(
                }
            }
        },
        ShflSync {
            data: ShflSyncDetails,
            type: Type::Scalar(ScalarType::B32),
            arguments<T>: {
                dst: T,
                dst_pred: {
                    repr: Option<T>,
                    type: Type::from(ScalarType::Pred)
                },
                src: T,
                src_lane: T,
                src_opts: T,
                src_membermask: T
            }
        },
        Shl {
            data: ScalarType,
            type: { Type::Scalar(data.clone()) },
@ -979,6 +994,11 @@ impl MovDetails {
    }
 }
 /// Modifier data carried by a parsed `shfl.sync` instruction.
 #[derive(Copy, Clone)]
 pub struct ShflSyncDetails {
    /// Shuffle mode selected by the `.up`, `.down`, `.bfly` or `.idx` modifier.
    pub mode: ShuffleMode,
 }
 #[derive(Clone)]
 pub enum ParsedOperand<Ident> {
    Reg(Ident),
--- a/ptx_parser/src/lib.rs
+++ b/ptx_parser/src/lib.rs
@ -1589,7 +1589,7 @@ where
 //   * Opcode: `ld`
 //   * Modifiers, always start with a dot: `.global`, `.relaxed`. Optionals are enclosed in braces
 //   * Arguments: `a`, `b`. Optionals are enclosed in braces
-//   * Code block: => { <code expression> }. Code blocks implictly take all modifiers ansd arguments
+//   * Code block: => { <code expression> }. Code blocks implicitly take all modifiers and arguments
 //     as parameters. All modifiers and arguments are passed to the code block:
 //     * If it is an alternative (as defined in rules list later):
 //       * If it is mandatory then its type is Foo (as defined by the relevant rule)
@ -1723,6 +1723,9 @@ derive_parser!(
    #[derive(Copy, Clone, PartialEq, Eq, Hash)]
    pub enum Reduction { }
    #[derive(Copy, Clone, PartialEq, Eq, Hash)]
    pub enum ShuffleMode { }
    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mov
    mov{.vec}.type  d, a => {
        Instruction::Mov {
@ -3487,6 +3490,14 @@ derive_parser!(
    .mode: Mul24Control = { .hi, .lo };
    .type: ScalarType = { .u32, .s32 };
    // https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-shfl-sync
    shfl.sync.mode.b32  d[|p], a, b, c, membermask => {
        Instruction::ShflSync  {
            data: ast::ShflSyncDetails { mode },
            arguments: ShflSyncArgs { dst: d, dst_pred: p, src: a, src_lane: b, src_opts: c, src_membermask: membermask }
        }
    }
    .mode: ShuffleMode = { .up, .down, .bfly, .idx };
 );
 #[cfg(test)]