// ZLUDA/ptx/src/ptx.lalrpop

use crate::ast;
use crate::ast::UnwrapWithVec;
use crate::{without_none, vector_index};

// Grammar header. `errors` is an extra parser parameter; action code below
// hands it to `unwrap_with(errors)` when a numeric token fails to parse.
grammar<'a>(errors: &mut Vec<ast::PtxError>);

// Error type produced by fallible (`=>?`) actions.
extern {
    type Error = ast::PtxError;
}

// Lexer specification. Tokens in an earlier `match` tier take priority over
// tokens in a later tier, so dotted modifiers and instruction mnemonics win
// over the generic `ID` / `DotID` patterns at the bottom.
match {
    // Whitespace, line comments and block comments are skipped.
    r"\s+" => { },
    r"//[^\n\r]*[\n\r]*" => { },
    r"/\*([^\*]*\*+[^\*/])*([^\*]*\*+|[^\*])*\*/" => { },
    // Optional sign, optional `0x` prefix, then decimal digits. The prefix must
    // be a (non-capturing) group: written as a character class it would also
    // accept a single leading `?`, `:` or `x`, so an identifier like `x1` would
    // be lexed as a number.
    // NOTE(review): hexadecimal digits a-f are still not accepted here.
    r"-?(?:0x)?[0-9]+" => Num,
    r#""[^"]*""# => String,
    r"[0-9]+\.[0-9]+" => VersionNumber,
    // Punctuation.
    "!",
    "(", ")",
    "+",
    ",",
    ".",
    ":",
    ";",
    "@",
    "[", "]",
    "{", "}",
    "<", ">",
    "|",
    // Dotted directives, types and modifiers.
    ".acquire",
    ".address_size",
    ".align",
    ".and",
    ".b16",
    ".b32",
    ".b64",
    ".b8",
    ".ca",
    ".cg",
    ".const",
    ".cs",
    ".cta",
    ".cv",
    ".entry",
    ".eq",
    ".equ",
    ".extern",
    ".f16",
    ".f16x2",
    ".f32",
    ".f64",
    ".file",
    ".ftz",
    ".func",
    ".ge",
    ".geu",
    ".global",
    ".gpu",
    ".gt",
    ".gtu",
    ".hi",
    ".hs",
    ".le",
    ".leu",
    ".lo",
    ".loc",
    ".local",
    ".ls",
    ".lt",
    ".ltu",
    ".lu",
    ".nan",
    ".ne",
    ".neu",
    ".num",
    ".or",
    ".param",
    ".pred",
    ".reg",
    ".relaxed",
    ".rm",
    ".rmi",
    ".rn",
    ".rni",
    ".rp",
    ".rpi",
    ".rz",
    ".rzi",
    ".s16",
    ".s32",
    ".s64",
    ".s8",
    ".sat",
    ".section",
    ".shared",
    ".sreg",
    ".sys",
    ".target",
    ".to",
    ".u16",
    ".u32",
    ".u64",
    ".u8",
    ".uni",
    ".v2",
    ".v4",
    ".version",
    ".visible",
    ".volatile",
    ".wb",
    ".weak",
    ".wide",
    ".wt",
    ".xor",
} else {
    // IF YOU ARE ADDING A NEW TOKEN HERE ALSO ADD IT BELOW TO ExtendedID
    "abs",
    "add",
    "bra",
    "call",
    "cvt",
    "cvta",
    "debug",
    "ld",
    "mad",
    "map_f64_to_f32",
    "mov",
    "mul",
    "not",
    "ret",
    "setp",
    "shl",
    "shr",
    r"sm_[0-9]+" => ShaderModel,
    "st",
    "texmode_independent",
    "texmode_unified",
} else {
    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#identifiers
    r"[a-zA-Z][a-zA-Z0-9_$]*|[_$%][a-zA-Z0-9_$]+" => ID,
    r"\.[a-zA-Z][a-zA-Z0-9_$]*" => DotID,
}

// Anything usable in identifier position: a plain `ID`, or one of the
// word-like tokens from the second lexer tier, which would otherwise never be
// produced as `ID`. Keep this list in sync with that tier.
ExtendedID : &'input str = {
    ID,
    ShaderModel,
    // instruction mnemonics
    "abs", "add", "bra", "call", "cvt", "cvta", "ld", "mad",
    "mov", "mul", "not", "ret", "setp", "shl", "shr", "st",
    // words that also appear as `.target` specifiers
    "debug", "map_f64_to_f32", "texmode_independent", "texmode_unified"
}

// Entry point: a `.version` line, a `.target` line, then any number of
// directives. Directives that yield `None` are dropped from the result.
pub Module: ast::Module<'input> = {
    <version:Version> Target <directives:Directive*> => {
        let functions = without_none(directives);
        ast::Module { version, functions }
    }
};

// `.version MAJOR.MINOR`, returned as a `(major, minor)` pair.
Version: (u8, u8) = {
    ".version" <text:VersionNumber> => {
        // The VersionNumber pattern always contains a '.', so `find` succeeds.
        let split_at = text.find('.').unwrap();
        let major_text = &text[..split_at];
        let minor_text = &text[split_at + 1..];
        (major_text.parse::<u8>(), minor_text.parse::<u8>()).unwrap_with(errors)
    }
}

// Module-level `.target` directive followed by a comma-separated list of
// specifiers. `Module` matches it but does not keep its value.
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#ptx-module-directives-target
Target = {
    ".target" Comma<TargetSpecifier>
};

// One entry of the `.target` list: an `sm_NN` shader model or one of the
// listed option words.
TargetSpecifier = {
    ShaderModel,
    "texmode_unified",
    "texmode_independent",
    "debug",
    "map_f64_to_f32"
};

// A top-level directive. Only functions produce a value; `.address_size`,
// `.file` and `.section` are recognised and then discarded.
Directive: Option<ast::Function<'input, &'input str, ast::Statement<ast::ParsedArgParams<'input>>>> = {
    <func:Function> => Some(func),
    AddressSize => None,
    File => None,
    Section => None
};

// `.address_size N`; `Directive` matches it and yields `None`.
AddressSize = {
    ".address_size" Num
};

// A kernel or function: optional linking directives (matched but not stored),
// the declaration, then either a body or a bare `;`.
Function: ast::Function<'input, &'input str, ast::Statement<ast::ParsedArgParams<'input>>> = {
    LinkingDirective* <decl:MethodDecl> <stmts:FunctionBody> => ast::Function {
        func_directive: decl,
        body: stmts,
    }
};

// Linkage keywords that may precede a function declaration. `Function`
// accepts any number of them and does not record which ones were present.
LinkingDirective = {
    ".extern",
    ".visible",
    ".weak"
};

// Declaration header: `.entry name(params)` for kernels, or
// `.func (returns)? name(params)` for functions. A missing return list
// becomes an empty vector.
MethodDecl: ast::MethodDecl<'input, &'input str> = {
    ".entry" <kernel_name:ExtendedID> <kernel_params:KernelArguments> => {
        ast::MethodDecl::Kernel(kernel_name, kernel_params)
    },
    ".func" <returns:FnArguments?> <fn_name:ExtendedID> <fn_params:FnArguments> => {
        let returns = returns.unwrap_or_default();
        ast::MethodDecl::Func(returns, fn_name, fn_params)
    }
};

// Parenthesised, comma-separated kernel parameter list.
KernelArguments: Vec<ast::KernelArgument<&'input str>> = {
    "(" <Comma<KernelInput>> ")"
};

// Parenthesised, comma-separated function parameter (or return value) list.
FnArguments: Vec<ast::FnArgument<&'input str>> = {
    "(" <Comma<FnInput>> ")"
};

// A single kernel parameter; only `.param` declarations are accepted.
KernelInput: ast::Variable<ast::VariableParamType, &'input str> = {
    <decl:ParamVariable> => ast::Variable {
        align: decl.0,
        v_type: decl.1,
        name: decl.2,
    }
}

// A single function parameter or return value: either a `.reg` or a `.param`
// declaration, tagged accordingly in `ast::FnArgumentType`.
FnInput: ast::Variable<ast::FnArgumentType, &'input str> = {
    <reg:RegVariable> => ast::Variable {
        align: reg.0,
        v_type: ast::FnArgumentType::Reg(reg.1),
        name: reg.2,
    },
    <param:ParamVariable> => ast::Variable {
        align: param.0,
        v_type: ast::FnArgumentType::Param(param.1),
        name: param.2,
    }
}

// Either a braced statement list (`Some`, with `None` statements removed) or
// a bare `;` for a declaration without a body (`None`).
pub(crate) FunctionBody: Option<Vec<ast::Statement<ast::ParsedArgParams<'input>>>> = {
    ";" => None,
    "{" <stmts:Statement*> "}" => Some(without_none(stmts))
};

// Maps a state-space keyword to the matching `ast::StateSpace` variant.
StateSpaceSpecifier: ast::StateSpace = {
    ".const" => ast::StateSpace::Const,
    ".global" => ast::StateSpace::Global,
    ".local" => ast::StateSpace::Local,
    ".param" => ast::StateSpace::Param, // used to prepare function call
    ".reg" => ast::StateSpace::Reg,
    ".shared" => ast::StateSpace::Shared,
    ".sreg" => ast::StateSpace::Sreg,
};

// Every scalar type keyword: the load/store types plus `.f16`, `.f16x2`
// and `.pred`.
ScalarType: ast::ScalarType = {
    LdStScalarType,
    ".pred" => ast::ScalarType::Pred,
    ".f16" => ast::ScalarType::F16,
    ".f16x2" => ast::ScalarType::F16x2
};

// Scalar types accepted by `ld` / `st` (via `LdStType`), listed by width.
LdStScalarType: ast::ScalarType = {
    // 8-bit
    ".b8" => ast::ScalarType::B8,
    ".u8" => ast::ScalarType::U8,
    ".s8" => ast::ScalarType::S8,
    // 16-bit
    ".b16" => ast::ScalarType::B16,
    ".u16" => ast::ScalarType::U16,
    ".s16" => ast::ScalarType::S16,
    // 32-bit
    ".b32" => ast::ScalarType::B32,
    ".u32" => ast::ScalarType::U32,
    ".s32" => ast::ScalarType::S32,
    ".f32" => ast::ScalarType::F32,
    // 64-bit
    ".b64" => ast::ScalarType::B64,
    ".u64" => ast::ScalarType::U64,
    ".s64" => ast::ScalarType::S64,
    ".f64" => ast::ScalarType::F64,
};

// One statement in a function body: a label, a debug directive (dropped),
// a variable declaration, an optionally predicated instruction, or a nested
// block.
Statement: Option<ast::Statement<ast::ParsedArgParams<'input>>> = {
    <label:Label> => Some(ast::Statement::Label(label)),
    DebugDirective => None,
    <var:MultiVariable> ";" => Some(ast::Statement::Variable(var)),
    <pred:PredAt?> <inst:Instruction> ";" => Some(ast::Statement::Instruction(pred, inst)),
    "{" <nested:Statement*> "}" => {
        let block = without_none(nested);
        Some(ast::Statement::Block(block))
    }
};

// Debugging directives allowed between statements; currently only `.loc`.
// `Statement` matches them and yields `None`.
DebugDirective: () = {
    DebugLocation
};

// `.loc` followed by three numbers; the values are not interpreted here.
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#debugging-directives-loc
DebugLocation = {
    ".loc" Num Num Num
};

// `name:` — yields the label name without the colon.
Label: &'input str = {
    <ExtendedID> ":"
};

// `.align N`; the number is parsed as `u32` and the parse result goes through
// `unwrap_with(errors)`.
Align: u32 = {
    ".align" <digits:Num> => digits.parse::<u32>().unwrap_with(errors)
};

// A variable declaration with an optional `<N>` count suffix.
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parameterized-variable-names
MultiVariable: ast::MultiVariable<&'input str> = {
    <decl:Variable> <reps:VariableParam?> => ast::MultiVariable { var: decl, count: reps }
}

// The `<N>` suffix of a parameterized variable name, parsed as `u32`.
VariableParam: u32 = {
    "<" <digits:Num> ">" => digits.parse::<u32>().unwrap_with(errors)
}

// A variable declaration inside a function body: `.reg`, `.local` or
// `.param`. The `.reg` and `.param` tuples are wrapped into
// `ast::VariableType`; `LocalVariable` already yields the final node.
Variable: ast::Variable<ast::VariableType, &'input str> = {
    <reg:RegVariable> => ast::Variable {
        align: reg.0,
        v_type: ast::VariableType::Reg(reg.1),
        name: reg.2,
    },
    LocalVariable,
    <param:ParamVariable> => ast::Variable {
        align: param.0,
        v_type: ast::VariableType::Param(param.1),
        name: param.2,
    },
};

// `.reg` declaration, scalar or vector, returned as
// `(alignment, register type, name)`.
RegVariable: (Option<u32>, ast::VariableRegType, &'input str) = {
    ".reg" <align:Align?> <scalar:ScalarType> <name:ExtendedID> => {
        (align, ast::VariableRegType::Scalar(scalar), name)
    },
    ".reg" <align:Align?> <width:VectorPrefix> <elem:SizedScalarType> <name:ExtendedID> => {
        (align, ast::VariableRegType::Vector(elem, width), name)
    }
}

// `.local` declaration in one of three shapes: scalar, vector (`.v2`/`.v4`
// prefix) or array (`[N]` suffix).
LocalVariable: ast::Variable<ast::VariableType, &'input str> = {
    ".local" <align:Align?> <elem:SizedScalarType> <name:ExtendedID> => ast::Variable {
        align,
        v_type: ast::VariableType::Local(ast::VariableLocalType::Scalar(elem)),
        name,
    },
    ".local" <align:Align?> <width:VectorPrefix> <elem:SizedScalarType> <name:ExtendedID> => ast::Variable {
        align,
        v_type: ast::VariableType::Local(ast::VariableLocalType::Vector(elem, width)),
        name,
    },
    ".local" <align:Align?> <elem:SizedScalarType> <name:ExtendedID> <len:ArraySpecifier> => ast::Variable {
        align,
        v_type: ast::VariableType::Local(ast::VariableLocalType::Array(elem, len)),
        name,
    }
}

// `.param` declaration, scalar or array, returned as
// `(alignment, parameter type, name)`.
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parameter-state-space
ParamVariable: (Option<u32>, ast::VariableParamType, &'input str) = {
    ".param" <align:Align?> <scalar:ParamScalarType> <name:ExtendedID> => {
        (align, ast::VariableParamType::Scalar(scalar), name)
    },
    ".param" <align:Align?> <elem:SizedScalarType> <name:ExtendedID> <len:ArraySpecifier> => {
        (align, ast::VariableParamType::Array(elem, len), name)
    }
}

// Scalar type keywords mapped to `ast::SizedScalarType` (no `.pred`),
// listed by width.
#[inline]
SizedScalarType: ast::SizedScalarType = {
    // 8-bit
    ".b8" => ast::SizedScalarType::B8,
    ".u8" => ast::SizedScalarType::U8,
    ".s8" => ast::SizedScalarType::S8,
    // 16-bit
    ".b16" => ast::SizedScalarType::B16,
    ".u16" => ast::SizedScalarType::U16,
    ".s16" => ast::SizedScalarType::S16,
    ".f16" => ast::SizedScalarType::F16,
    // 32-bit
    ".b32" => ast::SizedScalarType::B32,
    ".u32" => ast::SizedScalarType::U32,
    ".s32" => ast::SizedScalarType::S32,
    ".f16x2" => ast::SizedScalarType::F16x2,
    ".f32" => ast::SizedScalarType::F32,
    // 64-bit
    ".b64" => ast::SizedScalarType::B64,
    ".u64" => ast::SizedScalarType::U64,
    ".s64" => ast::SizedScalarType::S64,
    ".f64" => ast::SizedScalarType::F64,
}

// Scalar type keywords accepted for scalar `.param` declarations
// (no `.pred`, no `.f16x2`), listed by width.
#[inline]
ParamScalarType: ast::ParamScalarType = {
    // 8-bit
    ".b8" => ast::ParamScalarType::B8,
    ".u8" => ast::ParamScalarType::U8,
    ".s8" => ast::ParamScalarType::S8,
    // 16-bit
    ".b16" => ast::ParamScalarType::B16,
    ".u16" => ast::ParamScalarType::U16,
    ".s16" => ast::ParamScalarType::S16,
    ".f16" => ast::ParamScalarType::F16,
    // 32-bit
    ".b32" => ast::ParamScalarType::B32,
    ".u32" => ast::ParamScalarType::U32,
    ".s32" => ast::ParamScalarType::S32,
    ".f32" => ast::ParamScalarType::F32,
    // 64-bit
    ".b64" => ast::ParamScalarType::B64,
    ".u64" => ast::ParamScalarType::U64,
    ".s64" => ast::ParamScalarType::S64,
    ".f64" => ast::ParamScalarType::F64,
}

// `[N]` array length suffix, parsed as `u32`.
ArraySpecifier: u32 = {
    "[" <digits:Num> "]" => digits.parse::<u32>().unwrap_with(errors)
};

// Every supported instruction, grouped the way the PTX documentation
// sections referenced in this file group them.
Instruction: ast::Instruction<ast::ParsedArgParams<'input>> = {
    // data movement and conversion
    InstLd,
    InstSt,
    InstMov,
    InstCvt,
    InstCvta,
    // arithmetic
    InstAdd,
    InstMul,
    InstMad,
    InstAbs,
    // logic and shift
    InstNot,
    InstShl,
    // comparison
    InstSetp,
    // control flow
    InstBra,
    InstCall,
    InstRet
};

// `ld` with optional qualifier, state space and cache operator. Omitted
// modifiers default to weak / generic / cached.
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld
InstLd: ast::Instruction<ast::ParsedArgParams<'input>> = {
    "ld" <qualifier:LdStQualifier?> <space:LdStateSpace?> <cache:LdCacheOperator?> <typ:LdStType>
        <dst:IdOrVector> "," <src:MemoryOperand> => {
        let details = ast::LdDetails {
            qualifier: qualifier.unwrap_or(ast::LdStQualifier::Weak),
            state_space: space.unwrap_or(ast::LdStateSpace::Generic),
            caching: cache.unwrap_or(ast::LdCacheOperator::Cached),
            typ,
        };
        ast::Instruction::Ld(details, ast::Arg2Ld { dst, src })
    }
};

// A destination that is either one identifier or a braced vector of them.
IdOrVector: ast::IdOrVector<&'input str> = {
    <parts:VectorExtract> => ast::IdOrVector::Vec(parts),
    <reg:ExtendedID> => ast::IdOrVector::Reg(reg)
}

// A source that is either a regular operand or a braced vector of identifiers.
OperandOrVector: ast::OperandOrVector<&'input str> = {
    <parts:VectorExtract> => ast::OperandOrVector::Vec(parts),
    <operand:Operand> => ast::OperandOrVector::from(operand)
}

// Type suffix of `ld` / `st`: a scalar type, optionally preceded by a
// `.v2` / `.v4` vector prefix.
LdStType: ast::Type = {
    <scalar:LdStScalarType> => ast::Type::Scalar(scalar),
    <width:VectorPrefix> <elem:LdStScalarType> => ast::Type::Vector(elem, width),
}

// Qualifier shared by `ld` and `st`. `.relaxed` and `.acquire` must be
// followed by a scope.
LdStQualifier: ast::LdStQualifier = {
    ".weak" => ast::LdStQualifier::Weak,
    ".volatile" => ast::LdStQualifier::Volatile,
    ".relaxed" <scope:LdScope> => ast::LdStQualifier::Relaxed(scope),
    ".acquire" <scope:LdScope> => ast::LdStQualifier::Acquire(scope),
};

// Scope keyword following `.relaxed` / `.acquire`.
LdScope: ast::LdScope = {
    ".cta" => ast::LdScope::Cta,
    ".gpu" => ast::LdScope::Gpu,
    ".sys" => ast::LdScope::Sys,
};

// Explicit state spaces accepted by `ld`; `InstLd` substitutes `Generic`
// when none is written.
LdStateSpace: ast::LdStateSpace = {
    ".const" => ast::LdStateSpace::Const,
    ".global" => ast::LdStateSpace::Global,
    ".shared" => ast::LdStateSpace::Shared,
    ".local" => ast::LdStateSpace::Local,
    ".param" => ast::LdStateSpace::Param,
};

// Cache operator keywords accepted by `ld`.
LdCacheOperator: ast::LdCacheOperator = {
    ".ca" => ast::LdCacheOperator::Cached,
    ".cg" => ast::LdCacheOperator::L2Only,
    ".cs" => ast::LdCacheOperator::Streaming,
    ".cv" => ast::LdCacheOperator::Uncached,
    ".lu" => ast::LdCacheOperator::LastUse,
};

// `mov` in either form; both helpers yield a `(details, arguments)` pair.
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mov
InstMov: ast::Instruction<ast::ParsedArgParams<'input>> = {
    <normal:MovNormal> => {
        let (details, args) = normal;
        ast::Instruction::Mov(details, args)
    },
    <member:MovVector> => {
        let (details, args) = member;
        ast::Instruction::Mov(details, args)
    },
};


// `mov` between whole values: a scalar move into a single register, or a
// vector move (with `.v2` / `.v4` prefix) where either side may be a braced
// vector.
MovNormal: (ast::MovDetails, ast::Arg2Mov<ast::ParsedArgParams<'input>>) = {
    "mov" <scalar:MovScalarType> <dst:ExtendedID> "," <src:Operand> => {
        let details = ast::MovDetails::new(ast::Type::Scalar(scalar));
        (
            details,
            ast::Arg2Mov::Normal(ast::Arg2MovNormal { dst: ast::IdOrVector::Reg(dst), src: src.into() })
        )
    },
    "mov" <width:VectorPrefix> <elem:MovVectorType> <dst:IdOrVector> "," <src:OperandOrVector> => {
        let details = ast::MovDetails::new(ast::Type::Vector(elem, width));
        (
            details,
            ast::Arg2Mov::Normal(ast::Arg2MovNormal { dst, src })
        )
    }
}

// `mov` where at least one side is a vector member (`name.x` style).
MovVector: (ast::MovDetails, ast::Arg2Mov<ast::ParsedArgParams<'input>>) = {
    "mov" <elem:MovVectorType> <args:Arg2MovMember> => {
        let details = ast::MovDetails::new(ast::Type::Scalar(elem.into()));
        (details, ast::Arg2Mov::Member(args))
    },
}

// Types accepted by scalar `mov`: 16/32/64-bit integer and bit types,
// `.f32`, `.f64` and `.pred`.
#[inline]
MovScalarType: ast::ScalarType = {
    ".pred" => ast::ScalarType::Pred,
    // 16-bit
    ".b16" => ast::ScalarType::B16,
    ".u16" => ast::ScalarType::U16,
    ".s16" => ast::ScalarType::S16,
    // 32-bit
    ".b32" => ast::ScalarType::B32,
    ".u32" => ast::ScalarType::U32,
    ".s32" => ast::ScalarType::S32,
    ".f32" => ast::ScalarType::F32,
    // 64-bit
    ".b64" => ast::ScalarType::B64,
    ".u64" => ast::ScalarType::U64,
    ".s64" => ast::ScalarType::S64,
    ".f64" => ast::ScalarType::F64
};

// Element types accepted by vector and member `mov`: same as
// `MovScalarType` minus `.pred`.
#[inline]
MovVectorType: ast::ScalarType = {
    // 16-bit
    ".b16" => ast::ScalarType::B16,
    ".u16" => ast::ScalarType::U16,
    ".s16" => ast::ScalarType::S16,
    // 32-bit
    ".b32" => ast::ScalarType::B32,
    ".u32" => ast::ScalarType::U32,
    ".s32" => ast::ScalarType::S32,
    ".f32" => ast::ScalarType::F32,
    // 64-bit
    ".b64" => ast::ScalarType::B64,
    ".u64" => ast::ScalarType::U64,
    ".s64" => ast::ScalarType::S64,
    ".f64" => ast::ScalarType::F64,
};

// `mul` with its mode suffix and three operands.
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-mul
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-mul
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-mul
InstMul: ast::Instruction<ast::ParsedArgParams<'input>> = {
    "mul" <mode:InstMulMode> <args:Arg3> => ast::Instruction::Mul(mode, args)
};

// Mode suffix shared by `mul` and `mad`: an integer form (`.hi`/`.lo`/`.wide`
// plus integer type) or one of the floating-point forms. `.f64` takes only a
// rounding mode; the half-precision forms accept only `.rn` as rounding.
InstMulMode: ast::MulDetails = {
    <control:MulIntControl> <typ:IntType> => {
        ast::MulDetails::Int(ast::MulIntDesc { typ, control })
    },
    <rounding:RoundingModeFloat?> <ftz:".ftz"?> <sat:".sat"?> ".f32" => {
        ast::MulDetails::Float(ast::MulFloatDesc {
            typ: ast::FloatType::F32,
            rounding,
            flush_to_zero: ftz.is_some(),
            saturate: sat.is_some(),
        })
    },
    <rounding:RoundingModeFloat?> ".f64" => {
        ast::MulDetails::Float(ast::MulFloatDesc {
            typ: ast::FloatType::F64,
            rounding,
            flush_to_zero: false,
            saturate: false,
        })
    },
    <rn:".rn"?> <ftz:".ftz"?> <sat:".sat"?> ".f16" => {
        let rounding = rn.map(|_| ast::RoundingMode::NearestEven);
        ast::MulDetails::Float(ast::MulFloatDesc {
            typ: ast::FloatType::F16,
            rounding,
            flush_to_zero: ftz.is_some(),
            saturate: sat.is_some(),
        })
    },
    <rn:".rn"?> <ftz:".ftz"?> <sat:".sat"?> ".f16x2" => {
        let rounding = rn.map(|_| ast::RoundingMode::NearestEven);
        ast::MulDetails::Float(ast::MulFloatDesc {
            typ: ast::FloatType::F16x2,
            rounding,
            flush_to_zero: ftz.is_some(),
            saturate: sat.is_some(),
        })
    }
};

// Which part of an integer product is kept.
MulIntControl: ast::MulIntControl = {
    ".lo" => ast::MulIntControl::Low,
    ".hi" => ast::MulIntControl::High,
    ".wide" => ast::MulIntControl::Wide,
};

// Floating-point rounding modifiers (`.rn`, `.rz`, `.rm`, `.rp`).
#[inline]
RoundingModeFloat : ast::RoundingMode = {
    ".rn" => ast::RoundingMode::NearestEven,
    ".rz" => ast::RoundingMode::Zero,
    ".rp" => ast::RoundingMode::PositiveInf,
    ".rm" => ast::RoundingMode::NegativeInf,
};

// Integer rounding modifiers (`.rni`, `.rzi`, `.rmi`, `.rpi`); they map to the
// same `ast::RoundingMode` variants as their floating-point counterparts.
RoundingModeInt : ast::RoundingMode = {
    ".rni" => ast::RoundingMode::NearestEven,
    ".rzi" => ast::RoundingMode::Zero,
    ".rpi" => ast::RoundingMode::PositiveInf,
    ".rmi" => ast::RoundingMode::NegativeInf,
};

// 16/32/64-bit signed and unsigned integer types (no 8-bit forms).
IntType : ast::IntType = {
    // 16-bit
    ".u16" => ast::IntType::U16,
    ".s16" => ast::IntType::S16,
    // 32-bit
    ".u32" => ast::IntType::U32,
    ".s32" => ast::IntType::S32,
    // 64-bit
    ".u64" => ast::IntType::U64,
    ".s64" => ast::IntType::S64,
};

// `add` with its mode suffix and three operands.
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-add
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-add
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-add
InstAdd: ast::Instruction<ast::ParsedArgParams<'input>> = {
    "add" <mode:InstAddMode> <args:Arg3> => ast::Instruction::Add(mode, args)
};

// Mode suffix of `add`: plain integer, saturating `.sat.s32`, or one of the
// floating-point forms. `.f64` takes only a rounding mode; the half-precision
// forms accept only `.rn` as rounding.
InstAddMode: ast::AddDetails = {
    <t:IntType> => ast::AddDetails::Int(ast::AddIntDesc {
        typ: t,
        saturate: false,
    }),
    ".sat" ".s32" => ast::AddDetails::Int(ast::AddIntDesc {
        typ: ast::IntType::S32,
        saturate: true,
    }),
    <rn:RoundingModeFloat?> <ftz:".ftz"?> <sat:".sat"?> ".f32" => ast::AddDetails::Float(ast::AddFloatDesc {
        typ: ast::FloatType::F32,
        rounding: rn,
        flush_to_zero: ftz.is_some(),
        saturate: sat.is_some(),
    }),
    <rn:RoundingModeFloat?> ".f64" => ast::AddDetails::Float(ast::AddFloatDesc {
        typ: ast::FloatType::F64,
        rounding: rn,
        flush_to_zero: false,
        saturate: false,
    }),
    <rn:".rn"?> <ftz:".ftz"?> <sat:".sat"?> ".f16" => ast::AddDetails::Float(ast::AddFloatDesc {
        typ: ast::FloatType::F16,
        rounding: rn.map(|_| ast::RoundingMode::NearestEven),
        flush_to_zero: ftz.is_some(),
        saturate: sat.is_some(),
    }),
    // Built the same way as the `.f16x2` case of `InstMulMode`, instead of
    // panicking in the parser on valid input.
    <rn:".rn"?> <ftz:".ftz"?> <sat:".sat"?> ".f16x2" => ast::AddDetails::Float(ast::AddFloatDesc {
        typ: ast::FloatType::F16x2,
        rounding: rn.map(|_| ast::RoundingMode::NearestEven),
        flush_to_zero: ftz.is_some(),
        saturate: sat.is_some(),
    })
};

// `setp`, either plain (comparison only) or combined with a boolean operator
// and a third source operand.
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-setp
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-comparison-instructions-setp
// TODO: support f16 setp
InstSetp: ast::Instruction<ast::ParsedArgParams<'input>> = {
    "setp" <mode:SetpMode> <args:Arg4Setp> => ast::Instruction::Setp(mode, args),
    "setp" <mode:SetpBoolMode> <args:Arg5> => ast::Instruction::SetpBool(mode, args),
};

// Suffix of plain `setp`: comparison operator, optional `.ftz`, operand type.
SetpMode: ast::SetpData = {
    <cmp_op:SetpCompareOp> <flush:".ftz"?> <typ:SetpType> => ast::SetpData {
        typ,
        flush_to_zero: flush.is_some(),
        cmp_op,
    }
};

// Suffix of `setp` with a boolean post-operator: comparison operator,
// `.and`/`.or`/`.xor`, optional `.ftz`, operand type.
SetpBoolMode: ast::SetpBoolData = {
    <cmp_op:SetpCompareOp> <bool_op:SetpBoolPostOp> <flush:".ftz"?> <typ:SetpType> => ast::SetpBoolData {
        typ,
        flush_to_zero: flush.is_some(),
        cmp_op,
        bool_op,
    }
};

// Comparison operators of `setp`.
SetpCompareOp: ast::SetpCompareOp = {
    // equality
    ".eq" => ast::SetpCompareOp::Eq,
    ".ne" => ast::SetpCompareOp::NotEq,
    // ordering; `.lo`/`.ls`/`.hi`/`.hs` map to the same variants as
    // `.lt`/`.le`/`.gt`/`.ge`
    ".lt" => ast::SetpCompareOp::Less,
    ".lo" => ast::SetpCompareOp::Less,
    ".le" => ast::SetpCompareOp::LessOrEq,
    ".ls" => ast::SetpCompareOp::LessOrEq,
    ".gt" => ast::SetpCompareOp::Greater,
    ".hi" => ast::SetpCompareOp::Greater,
    ".ge" => ast::SetpCompareOp::GreaterOrEq,
    ".hs" => ast::SetpCompareOp::GreaterOrEq,
    // `u`-suffixed forms map to the `Nan*` variants
    ".equ" => ast::SetpCompareOp::NanEq,
    ".neu" => ast::SetpCompareOp::NanNotEq,
    ".ltu" => ast::SetpCompareOp::NanLess,
    ".leu" => ast::SetpCompareOp::NanLessOrEq,
    ".gtu" => ast::SetpCompareOp::NanGreater,
    ".geu" => ast::SetpCompareOp::NanGreaterOrEq,
    // NaN tests
    ".num" => ast::SetpCompareOp::IsNotNan,
    ".nan" => ast::SetpCompareOp::IsNan,
};

// Boolean operator applied after the comparison in the combined `setp` form.
SetpBoolPostOp: ast::SetpBoolPostOp = {
    ".and" => ast::SetpBoolPostOp::And,
    ".or" => ast::SetpBoolPostOp::Or,
    ".xor" => ast::SetpBoolPostOp::Xor
};

// Operand types accepted by `setp`, listed by width.
SetpType: ast::ScalarType = {
    // 16-bit
    ".b16" => ast::ScalarType::B16,
    ".u16" => ast::ScalarType::U16,
    ".s16" => ast::ScalarType::S16,
    // 32-bit
    ".b32" => ast::ScalarType::B32,
    ".u32" => ast::ScalarType::U32,
    ".s32" => ast::ScalarType::S32,
    ".f32" => ast::ScalarType::F32,
    // 64-bit
    ".b64" => ast::ScalarType::B64,
    ".u64" => ast::ScalarType::U64,
    ".s64" => ast::ScalarType::S64,
    ".f64" => ast::ScalarType::F64,
};

// `not` with its type suffix and two operands.
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-not
InstNot: ast::Instruction<ast::ParsedArgParams<'input>> = {
    "not" <typ:NotType> <args:Arg2> => ast::Instruction::Not(typ, args)
};

// Types accepted by `not`: `.pred` and the 16/32/64-bit bit types.
NotType: ast::NotType = {
    ".b16" => ast::NotType::B16,
    ".b32" => ast::NotType::B32,
    ".b64" => ast::NotType::B64,
    ".pred" => ast::NotType::Pred,
};

// Instruction guard: `@p` or, negated, `@!p`.
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-at
PredAt: ast::PredAt<&'input str> = {
    "@" <negation:"!"?> <pred:ExtendedID> => ast::PredAt { not: negation.is_some(), label: pred }
};

// `bra` with optional `.uni` and a single target operand.
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-bra
InstBra: ast::Instruction<ast::ParsedArgParams<'input>> = {
    "bra" <uni:".uni"?> <target:Arg1> => {
        let details = ast::BraData { uniform: uni.is_some() };
        ast::Instruction::Bra(details, target)
    }
};

// `cvt` in all supported destination/source combinations. The integer-involving
// forms delegate modifier validation to the `*_checked` constructors, which
// receive `errors`; the float-to-float forms are spelled out per type pair so
// that each accepts only the modifiers written in its alternative.
// NOTE(review): `.f16 .f16` takes an integer rounding mode while `.f32 .f32`
// and `.f64 .f64` take a floating-point one — confirm against the PTX `cvt`
// documentation.
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt
InstCvt: ast::Instruction<ast::ParsedArgParams<'input>> = {
    // integer <- integer
    "cvt" <s:".sat"?> <dst_t:CvtTypeInt> <src_t:CvtTypeInt> <a:Arg2> => {
        ast::Instruction::Cvt(ast::CvtDetails::new_int_from_int_checked(
            s.is_some(),
            dst_t,
            src_t,
            errors
        ),
        a)
    },
    // float <- integer (floating-point rounding mode required)
    "cvt" <r:RoundingModeFloat> <f:".ftz"?> <s:".sat"?> <dst_t:CvtTypeFloat> <src_t:CvtTypeInt> <a:Arg2> => {
        ast::Instruction::Cvt(ast::CvtDetails::new_float_from_int_checked(
            r,
            f.is_some(),
            s.is_some(),
            dst_t,
            src_t,
            errors
        ),
        a)
    },
    // integer <- float (integer rounding mode required)
    "cvt" <r:RoundingModeInt> <f:".ftz"?> <s:".sat"?> <dst_t:CvtTypeInt> <src_t:CvtTypeFloat> <a:Arg2> => {
        ast::Instruction::Cvt(ast::CvtDetails::new_int_from_float_checked(
            r,
            f.is_some(),
            s.is_some(),
            dst_t,
            src_t,
            errors
        ),
        a)
    },
    // f16 <- f16
    "cvt" <r:RoundingModeInt?> <s:".sat"?> ".f16" ".f16" <a:Arg2> => {
        ast::Instruction::Cvt(ast::CvtDetails::FloatFromFloat(
            ast::CvtDesc {
                rounding: r,
                flush_to_zero: false,
                saturate: s.is_some(),
                dst: ast::FloatType::F16,
                src: ast::FloatType::F16
            }
        ), a)
    },
    // f32 <- f16
    "cvt" <f:".ftz"?> <s:".sat"?> ".f32" ".f16" <a:Arg2> => {
        ast::Instruction::Cvt(ast::CvtDetails::FloatFromFloat(
            ast::CvtDesc {
                rounding: None,
                flush_to_zero: f.is_some(),
                saturate: s.is_some(),
                dst: ast::FloatType::F32,
                src: ast::FloatType::F16
            }
        ), a)
    },
    // f64 <- f16
    "cvt" <s:".sat"?> ".f64" ".f16" <a:Arg2> => {
        ast::Instruction::Cvt(ast::CvtDetails::FloatFromFloat(
            ast::CvtDesc {
                rounding: None,
                flush_to_zero: false,
                saturate: s.is_some(),
                dst: ast::FloatType::F64,
                src: ast::FloatType::F16
            }
        ), a)
    },
    // f16 <- f32
    "cvt" <r:RoundingModeFloat> <f:".ftz"?> <s:".sat"?> ".f16" ".f32" <a:Arg2> => {
        ast::Instruction::Cvt(ast::CvtDetails::FloatFromFloat(
            ast::CvtDesc {
                rounding: Some(r),
                flush_to_zero: f.is_some(),
                saturate: s.is_some(),
                dst: ast::FloatType::F16,
                src: ast::FloatType::F32
            }
        ), a)
    },
    // f32 <- f32
    "cvt" <r:RoundingModeFloat?> <f:".ftz"?> <s:".sat"?> ".f32" ".f32" <a:Arg2> => {
        ast::Instruction::Cvt(ast::CvtDetails::FloatFromFloat(
            ast::CvtDesc {
                rounding: r,
                flush_to_zero: f.is_some(),
                saturate: s.is_some(),
                dst: ast::FloatType::F32,
                src: ast::FloatType::F32
            }
        ), a)
    },
    // f64 <- f32
    "cvt" <s:".sat"?> ".f64" ".f32" <a:Arg2> => {
        ast::Instruction::Cvt(ast::CvtDetails::FloatFromFloat(
            ast::CvtDesc {
                rounding: None,
                flush_to_zero: false,
                saturate: s.is_some(),
                dst: ast::FloatType::F64,
                src: ast::FloatType::F32
            }
        ), a)
    },
    // f16 <- f64
    "cvt" <r:RoundingModeFloat> <s:".sat"?> ".f16" ".f64" <a:Arg2> => {
        ast::Instruction::Cvt(ast::CvtDetails::FloatFromFloat(
            ast::CvtDesc {
                rounding: Some(r),
                flush_to_zero: false,
                saturate: s.is_some(),
                dst: ast::FloatType::F16,
                src: ast::FloatType::F64
            }
        ), a)
    },
    // f32 <- f64
    "cvt" <r:RoundingModeFloat> <f:".ftz"?> <s:".sat"?> ".f32" ".f64" <a:Arg2> => {
        ast::Instruction::Cvt(ast::CvtDetails::FloatFromFloat(
            ast::CvtDesc {
                rounding: Some(r),
                // Taken from `.ftz` (`f`), not from `.sat`.
                flush_to_zero: f.is_some(),
                saturate: s.is_some(),
                dst: ast::FloatType::F32,
                src: ast::FloatType::F64
            }
        ), a)
    },
    // f64 <- f64
    "cvt" <r:RoundingModeFloat?> <s:".sat"?> ".f64" ".f64" <a:Arg2> => {
        ast::Instruction::Cvt(ast::CvtDetails::FloatFromFloat(
            ast::CvtDesc {
                rounding: r,
                flush_to_zero: false,
                saturate: s.is_some(),
                dst: ast::FloatType::F64,
                src: ast::FloatType::F64
            }
        ), a)
    },
};

// Integer types accepted by `cvt` (including the 8-bit forms), by width.
CvtTypeInt: ast::IntType = {
    // 8-bit
    ".u8" => ast::IntType::U8,
    ".s8" => ast::IntType::S8,
    // 16-bit
    ".u16" => ast::IntType::U16,
    ".s16" => ast::IntType::S16,
    // 32-bit
    ".u32" => ast::IntType::U32,
    ".s32" => ast::IntType::S32,
    // 64-bit
    ".u64" => ast::IntType::U64,
    ".s64" => ast::IntType::S64,
};

// Floating-point types accepted by the integer-involving `cvt` forms.
CvtTypeFloat: ast::FloatType = {
    ".f16" => ast::FloatType::F16,
    ".f32" => ast::FloatType::F32,
    ".f64" => ast::FloatType::F64
};

// `shl` with its bit-type suffix and three operands.
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-shl
InstShl: ast::Instruction<ast::ParsedArgParams<'input>> = {
    "shl" <typ:ShlType> <args:Arg3> => ast::Instruction::Shl(typ, args)
};

// Bit types accepted by `shl`.
ShlType: ast::ShlType = {
    ".b16" => ast::ShlType::B16,
    ".b32" => ast::ShlType::B32,
    ".b64" => ast::ShlType::B64
};

// `st` with optional qualifier, state space and cache operator. Omitted
// modifiers default to weak / generic / write-back.
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st
// Warning: NVIDIA documentation is incorrect, you can specify scope only once
InstSt: ast::Instruction<ast::ParsedArgParams<'input>> = {
    "st" <qualifier:LdStQualifier?> <space:StStateSpace?> <cache:StCacheOperator?> <typ:LdStType>
        <src1:MemoryOperand> "," <src2:OperandOrVector> => {
        let details = ast::StData {
            qualifier: qualifier.unwrap_or(ast::LdStQualifier::Weak),
            state_space: space.unwrap_or(ast::StStateSpace::Generic),
            caching: cache.unwrap_or(ast::StCacheOperator::Writeback),
            typ,
        };
        ast::Instruction::St(details, ast::Arg2St { src1, src2 })
    }
};

// An operand wrapped in square brackets; the brackets are dropped.
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#using-addresses-arrays-and-vectors
MemoryOperand: ast::Operand<&'input str> = {
    "[" <Operand> "]"
}

// Explicit state spaces accepted by `st`; `InstSt` substitutes `Generic`
// when none is written.
StStateSpace: ast::StStateSpace = {
    ".global" => ast::StStateSpace::Global,
    ".shared" => ast::StStateSpace::Shared,
    ".local" => ast::StStateSpace::Local,
    ".param" => ast::StStateSpace::Param,
};

// Cache operator keywords accepted by `st`.
StCacheOperator: ast::StCacheOperator = {
    ".wb" => ast::StCacheOperator::Writeback,
    ".wt" => ast::StCacheOperator::Writethrough,
    ".cg" => ast::StCacheOperator::L2Only,
    ".cs" => ast::StCacheOperator::Streaming,
};

// `ret` with optional `.uni`.
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-ret
InstRet: ast::Instruction<ast::ParsedArgParams<'input>> = {
    "ret" <uni:".uni"?> => {
        let details = ast::RetData { uniform: uni.is_some() };
        ast::Instruction::Ret(details)
    }
};

// `cvta`: without `.to` the named space goes into `to` and `from` is generic;
// with `.to` the named space goes into `from` and `to` is generic.
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvta
InstCvta: ast::Instruction<ast::ParsedArgParams<'input>> = {
    "cvta" <space:CvtaStateSpace> <size:CvtaSize> <args:Arg2> => {
        let details = ast::CvtaDetails {
            to: space,
            from: ast::CvtaStateSpace::Generic,
            size,
        };
        ast::Instruction::Cvta(details, args)
    },
    "cvta" ".to" <space:CvtaStateSpace> <size:CvtaSize> <args:Arg2> => {
        let details = ast::CvtaDetails {
            to: ast::CvtaStateSpace::Generic,
            from: space,
            size,
        };
        ast::Instruction::Cvta(details, args)
    }
}

// State spaces that can be named in a `cvta` instruction.
CvtaStateSpace: ast::CvtaStateSpace = {
    ".const" => ast::CvtaStateSpace::Const,
    ".global" => ast::CvtaStateSpace::Global,
    ".shared" => ast::CvtaStateSpace::Shared,
    ".local" => ast::CvtaStateSpace::Local,
}

// Address size suffix of `cvta`.
CvtaSize: ast::CvtaSize = {
    ".u32" => ast::CvtaSize::U32,
    ".u64" => ast::CvtaSize::U64
}

// `call` with optional `.uni`; the argument tuple from `ArgCall` is unpacked
// into the fields of `ast::CallInst`.
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-call
InstCall: ast::Instruction<ast::ParsedArgParams<'input>> = {
    "call" <uni:".uni"?> <call:ArgCall> => {
        let uniform = uni.is_some();
        let (ret_params, func, param_list) = call;
        ast::Instruction::Call(ast::CallInst { uniform, ret_params, func, param_list })
    }
};

// `abs` for signed integers and floating-point types. `.ftz` is accepted for
// `.f32`, `.f16` and `.f16x2` but not for `.f64` or the integer forms.
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-abs
InstAbs: ast::Instruction<ast::ParsedArgParams<'input>> = {
    "abs" <typ:SignedIntType> <args:Arg2> => {
        let details = ast::AbsDetails { flush_to_zero: false, typ };
        ast::Instruction::Abs(details, args)
    },
    "abs" <flush:".ftz"?> ".f32" <args:Arg2> => {
        let details = ast::AbsDetails { flush_to_zero: flush.is_some(), typ: ast::ScalarType::F32 };
        ast::Instruction::Abs(details, args)
    },
    "abs" ".f64" <args:Arg2> => {
        let details = ast::AbsDetails { flush_to_zero: false, typ: ast::ScalarType::F64 };
        ast::Instruction::Abs(details, args)
    },
    "abs" <flush:".ftz"?> ".f16" <args:Arg2> => {
        let details = ast::AbsDetails { flush_to_zero: flush.is_some(), typ: ast::ScalarType::F16 };
        ast::Instruction::Abs(details, args)
    },
    // Not implemented yet: matching this form panics.
    "abs" <f:".ftz"?> ".f16x2" <a:Arg2> => {
        todo!()
    },
};

// `mad`; reuses the `mul` mode suffix and takes four operands.
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-mad
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-mad
InstMad: ast::Instruction<ast::ParsedArgParams<'input>> = {
    "mad" <mode:InstMulMode> <args:Arg4> => ast::Instruction::Mad(mode, args),
    // Not implemented yet: this alternative takes no operands and panics
    // when matched.
    "mad" ".hi" ".sat" ".s32" => todo!()
};

// Signed integer types, as `ast::ScalarType` (used by `abs`).
SignedIntType: ast::ScalarType = {
    ".s16" => ast::ScalarType::S16,
    ".s32" => ast::ScalarType::S32,
    ".s64" => ast::ScalarType::S64
};

// An instruction operand: a register, `register + offset` (offset parsed as
// `i32`), or an immediate (parsed as `u32`). Parse results go through
// `unwrap_with(errors)`.
Operand: ast::Operand<&'input str> = {
    <reg:ExtendedID> => ast::Operand::Reg(reg),
    <base:ExtendedID> "+" <disp:Num> => {
        let disp = disp.parse::<i32>().unwrap_with(errors);
        ast::Operand::RegOffset(base, disp)
    },
    // TODO: start parsing whole constants sub-language:
    //       https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#constants
    <literal:Num> => {
        let value = literal.parse::<u32>().unwrap_with(errors);
        ast::Operand::Imm(value)
    }
};

// An argument in a `call` parameter list: a register or a `u32` immediate.
CallOperand: ast::CallOperand<&'input str> = {
    <reg:ExtendedID> => ast::CallOperand::Reg(reg),
    <literal:Num> => {
        let value = literal.parse::<u32>().unwrap_with(errors);
        ast::CallOperand::Imm(value)
    }
};

// Single-identifier argument list.
Arg1: ast::Arg1<ast::ParsedArgParams<'input>> = {
    <target:ExtendedID> => ast::Arg1 { src: target }
};

// `dst, src` argument list.
Arg2: ast::Arg2<ast::ParsedArgParams<'input>> = {
    <target:ExtendedID> "," <source:Operand> => ast::Arg2 { dst: target, src: source }
};

// Arguments of a member `mov`: the member access may be on the destination,
// on the source, or on both. When the destination is a member, its base
// name is passed a second time alongside the `(name, index)` pair.
Arg2MovMember: ast::Arg2MovMember<ast::ParsedArgParams<'input>> = {
    <member:MemberOperand> "," <src:ExtendedID> => {
        let base = member.0;
        ast::Arg2MovMember::Dst(member, base, src)
    },
    <dst:ExtendedID> "," <member:MemberOperand> => ast::Arg2MovMember::Src(dst, member),
    <dst_member:MemberOperand> "," <src_member:MemberOperand> => {
        let base = dst_member.0;
        ast::Arg2MovMember::Both(dst_member, base, src_member)
    },
};

// `name.member`, returned as `(name, index)`. The member may arrive either as
// a separate `.` plus identifier or as a single `DotID` token, in which case
// the leading dot is stripped before calling `vector_index`. A failing
// `vector_index` makes the action fail.
MemberOperand: (&'input str, u8) = {
    <base:ExtendedID> "." <member:ExtendedID> =>? {
        Ok((base, vector_index(member)?))
    },
    <base:ExtendedID> <dotted:DotID> =>? {
        let member = &dotted[1..];
        Ok((base, vector_index(member)?))
    }
};

// Braced list of exactly two or exactly four identifiers.
VectorExtract: Vec<&'input str> = {
    "{" <a:ExtendedID> "," <b:ExtendedID> "}" => vec![a, b],
    "{" <a:ExtendedID> "," <b:ExtendedID> "," <c:ExtendedID> "," <d:ExtendedID> "}" => vec![a, b, c, d],
};

// `dst, src1, src2` argument list.
Arg3: ast::Arg3<ast::ParsedArgParams<'input>> = {
    <target:ExtendedID> "," <lhs:Operand> "," <rhs:Operand> => ast::Arg3 {
        dst: target,
        src1: lhs,
        src2: rhs,
    }
};

// `dst, src1, src2, src3` argument list.
Arg4: ast::Arg4<ast::ParsedArgParams<'input>> = {
    <target:ExtendedID> "," <first:Operand> "," <second:Operand> "," <third:Operand> => ast::Arg4 {
        dst: target,
        src1: first,
        src2: second,
        src3: third,
    }
};

// Arguments of plain `setp`: `dst1[|dst2], src1, src2`.
Arg4Setp: ast::Arg4Setp<ast::ParsedArgParams<'input>> = {
    <first_dst:ExtendedID> <second_dst:OptionalDst?> "," <lhs:Operand> "," <rhs:Operand> => ast::Arg4Setp {
        dst1: first_dst,
        dst2: second_dst,
        src1: lhs,
        src2: rhs,
    }
};

// Arguments of `setp` with a boolean operator: `dst1[|dst2], src1, src2, [!]src3`.
// The optional `!` before the third source is accepted but not recorded.
// TODO: pass src3 negation somewhere
Arg5: ast::Arg5<ast::ParsedArgParams<'input>> = {
    <first_dst:ExtendedID> <second_dst:OptionalDst?> "," <lhs:Operand> "," <rhs:Operand> "," "!"? <extra:Operand> => ast::Arg5 {
        dst1: first_dst,
        dst2: second_dst,
        src1: lhs,
        src2: rhs,
        src3: extra,
    }
};

// Arguments of `call`, returned as `(return params, function, params)`.
// Three shapes: `(rets), func, (params)`, `func, (params)` and bare `func`;
// missing lists become empty vectors.
ArgCall: (Vec<&'input str>, &'input str, Vec<ast::CallOperand<&'input str>>) = {
    "(" <rets:Comma<ExtendedID>> ")" "," <callee:ExtendedID> "," "(" <params:Comma<CallOperand>> ")" => {
        (rets, callee, params)
    },
    <callee:ExtendedID> "," "(" <params:Comma<CallOperand>> ")" => {
        (Vec::new(), callee, params)
    },
    <callee:ExtendedID> => {
        (Vec::new(), callee, Vec::<ast::CallOperand<_>>::new())
    },
};

// Second `setp` destination, written as `|name`; yields the name.
OptionalDst: &'input str = {
    "|" <ExtendedID>
}

// Vector width prefix, yielding the element count.
VectorPrefix: u8 = {
    ".v4" => 4,
    ".v2" => 2
};

// `.file` debugging directive: an index, a quoted name, and optionally two
// more comma-separated numbers. `Directive` matches it and yields `None`.
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#debugging-directives-file
File = {
    ".file" Num String ("," Num "," Num)?
};

// `.section` debugging directive: a dotted section name and a braced list of
// data lines. `Directive` matches it and yields `None`.
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#debugging-directives-section
Section = {
    ".section" DotID "{" SectionDwarfLines* "}"
};

// One line inside a `.section` body: a bit type followed by a comma-separated
// list of numbers, or `.b32` / `.b64` followed by a label with an optional
// `+ offset`. Nothing is kept from these lines.
SectionDwarfLines: () = {
    BitType Comma<Num>,
    ".b32" SectionLabel,
    ".b64" SectionLabel,
    ".b32" SectionLabel "+" Num,
    ".b64" SectionLabel "+" Num,
};

// Label referenced from a `.section` line: a plain or a dotted identifier.
SectionLabel = {
    ID,
    DotID
};

// Bit-type keywords usable at the start of a `.section` data line.
BitType = {
    ".b8", ".b16", ".b32", ".b64"
};

// Comma-separated list of `T`: zero or more `T ,` pairs followed by an
// optional final `T`, so both empty lists and a trailing comma are accepted.
Comma<T>: Vec<T> = {
    <head:(<T> ",")*> <tail:T?> => {
        let mut items = head;
        items.extend(tail);
        items
    }
};