mirror of
https://github.com/vosen/ZLUDA.git
synced 2025-04-12 10:48:53 +03:00
Support instruction modes (denormal and rounding) on AMD GPUs (#342)
This commit is contained in:
175
Cargo.lock
generated
175
Cargo.lock
generated
@ -52,6 +52,12 @@ dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "autocfg"
|
||||
version = "1.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26"
|
||||
|
||||
[[package]]
|
||||
name = "beef"
|
||||
version = "0.5.2"
|
||||
@ -328,6 +334,12 @@ version = "1.13.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"
|
||||
|
||||
[[package]]
|
||||
name = "equivalent"
|
||||
version = "1.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
|
||||
|
||||
[[package]]
|
||||
name = "errno"
|
||||
version = "0.3.9"
|
||||
@ -344,6 +356,12 @@ version = "2.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e8c02a5121d4ea3eb16a80748c74f5549a5665e4c21333c6098f283870fbdea6"
|
||||
|
||||
[[package]]
|
||||
name = "fixedbitset"
|
||||
version = "0.5.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99"
|
||||
|
||||
[[package]]
|
||||
name = "fnv"
|
||||
version = "1.0.7"
|
||||
@ -373,6 +391,12 @@ version = "1.8.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1b43ede17f21864e81be2fa654110bf1e793774238d86ef8555c37e6519c0403"
|
||||
|
||||
[[package]]
|
||||
name = "hashbrown"
|
||||
version = "0.15.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289"
|
||||
|
||||
[[package]]
|
||||
name = "heck"
|
||||
version = "0.5.0"
|
||||
@ -383,6 +407,29 @@ checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
|
||||
name = "hip_runtime-sys"
|
||||
version = "0.0.0"
|
||||
|
||||
[[package]]
|
||||
name = "indexmap"
|
||||
version = "2.7.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8c9c992b02b5b4c94ea26e32fe5bccb7aa7d9f390ab5c1221ff895bc7ea8b652"
|
||||
dependencies = [
|
||||
"equivalent",
|
||||
"hashbrown",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "int-enum"
|
||||
version = "1.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7a37a9c11c6ecfec8b9bed97337dfecff3686d02ba8f52e8addad2829d047128"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"proc-macro2-diagnostics",
|
||||
"quote",
|
||||
"syn 2.0.89",
|
||||
"version_check",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "itertools"
|
||||
version = "0.13.0"
|
||||
@ -499,6 +546,16 @@ dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "matrixmultiply"
|
||||
version = "0.3.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9380b911e3e96d10c1f415da0876389aaf1b56759054eeb0de7df940c456ba1a"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"rawpointer",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "memchr"
|
||||
version = "2.7.4"
|
||||
@ -514,12 +571,37 @@ dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "microlp"
|
||||
version = "0.2.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "edaa5264bc1f7668bc12e10757f8f529a526656c796cc2106cf2be10c5b8d483"
|
||||
dependencies = [
|
||||
"log",
|
||||
"sprs",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "minimal-lexical"
|
||||
version = "0.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
|
||||
|
||||
[[package]]
|
||||
name = "ndarray"
|
||||
version = "0.16.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "882ed72dce9365842bf196bdeedf5055305f11fc8c03dee7bb0194a6cad34841"
|
||||
dependencies = [
|
||||
"matrixmultiply",
|
||||
"num-complex",
|
||||
"num-integer",
|
||||
"num-traits",
|
||||
"portable-atomic",
|
||||
"portable-atomic-util",
|
||||
"rawpointer",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nom"
|
||||
version = "7.1.3"
|
||||
@ -530,6 +612,33 @@ dependencies = [
|
||||
"minimal-lexical",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num-complex"
|
||||
version = "0.4.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495"
|
||||
dependencies = [
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num-integer"
|
||||
version = "0.1.46"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f"
|
||||
dependencies = [
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num-traits"
|
||||
version = "0.2.19"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num_enum"
|
||||
version = "0.4.3"
|
||||
@ -567,6 +676,16 @@ version = "1.0.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a"
|
||||
|
||||
[[package]]
|
||||
name = "petgraph"
|
||||
version = "0.7.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772"
|
||||
dependencies = [
|
||||
"fixedbitset",
|
||||
"indexmap",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "plain"
|
||||
version = "0.2.3"
|
||||
@ -579,6 +698,15 @@ version = "1.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cc9c68a3f6da06753e9335d63e27f6b9754dd1920d941135b7ea8224f141adb2"
|
||||
|
||||
[[package]]
|
||||
name = "portable-atomic-util"
|
||||
version = "0.2.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d8a2f0d8d040d7848a709caf78912debcc3f33ee4b3cac47d73d1e1069e83507"
|
||||
dependencies = [
|
||||
"portable-atomic",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pretty_assertions"
|
||||
version = "1.4.1"
|
||||
@ -647,6 +775,19 @@ dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro2-diagnostics"
|
||||
version = "0.10.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "af066a9c399a26e020ada66a034357a868728e72cd426f3adcd35f80d88d88c8"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.89",
|
||||
"version_check",
|
||||
"yansi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ptx"
|
||||
version = "0.0.0"
|
||||
@ -657,8 +798,11 @@ dependencies = [
|
||||
"cuda-driver-sys",
|
||||
"half",
|
||||
"hip_runtime-sys",
|
||||
"int-enum",
|
||||
"llvm_zluda",
|
||||
"microlp",
|
||||
"paste",
|
||||
"petgraph",
|
||||
"pretty_assertions",
|
||||
"ptx_parser",
|
||||
"quick-error",
|
||||
@ -667,6 +811,7 @@ dependencies = [
|
||||
"strum_macros",
|
||||
"tempfile",
|
||||
"thiserror 1.0.64",
|
||||
"unwrap_or",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -720,6 +865,12 @@ dependencies = [
|
||||
"proc-macro2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rawpointer"
|
||||
version = "0.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3"
|
||||
|
||||
[[package]]
|
||||
name = "regex"
|
||||
version = "1.11.0"
|
||||
@ -859,6 +1010,24 @@ version = "1.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
|
||||
|
||||
[[package]]
|
||||
name = "smallvec"
|
||||
version = "1.13.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67"
|
||||
|
||||
[[package]]
|
||||
name = "sprs"
|
||||
version = "0.11.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8bff8419009a08f6cb7519a602c5590241fbff1446bcc823c07af15386eb801b"
|
||||
dependencies = [
|
||||
"ndarray",
|
||||
"num-complex",
|
||||
"num-traits",
|
||||
"smallvec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "strum"
|
||||
version = "0.26.3"
|
||||
@ -980,6 +1149,12 @@ version = "0.2.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853"
|
||||
|
||||
[[package]]
|
||||
name = "unwrap_or"
|
||||
version = "1.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8f2fe1f049979528ce97d8a4672f984f8846fc9975be0cf14ca798116d724c4a"
|
||||
|
||||
[[package]]
|
||||
name = "version_check"
|
||||
version = "0.9.5"
|
||||
|
@ -133,21 +133,26 @@ pub fn compile_bitcode(
|
||||
&linking_info,
|
||||
amd_comgr_action_kind_t::AMD_COMGR_ACTION_LINK_BC_TO_BC,
|
||||
)?;
|
||||
let link_with_device_libs_info = ActionInfo::new()?;
|
||||
link_with_device_libs_info.set_isa_name(gcn_arch)?;
|
||||
link_with_device_libs_info.set_language(amd_comgr_language_t::AMD_COMGR_LANGUAGE_LLVM_IR)?;
|
||||
// This makes no sense, but it makes ockl linking work
|
||||
link_with_device_libs_info
|
||||
.set_options([c"-Xclang", c"-mno-link-builtin-bitcode-postopt"].into_iter())?;
|
||||
let with_device_libs = do_action(
|
||||
&linked_data_set,
|
||||
&link_with_device_libs_info,
|
||||
amd_comgr_action_kind_t::AMD_COMGR_ACTION_COMPILE_SOURCE_WITH_DEVICE_LIBS_TO_BC,
|
||||
)?;
|
||||
let compile_action_info = ActionInfo::new()?;
|
||||
compile_action_info.set_isa_name(gcn_arch)?;
|
||||
let common_options = [c"-O3", c"-mno-wavefrontsize64", c"-mcumode"].into_iter();
|
||||
let compile_to_exec = ActionInfo::new()?;
|
||||
compile_to_exec.set_isa_name(gcn_arch)?;
|
||||
compile_to_exec.set_language(amd_comgr_language_t::AMD_COMGR_LANGUAGE_LLVM_IR)?;
|
||||
let common_options = [
|
||||
// This makes no sense, but it makes ockl linking work
|
||||
c"-Xclang",
|
||||
c"-mno-link-builtin-bitcode-postopt",
|
||||
// Otherwise LLVM omits dynamic fp mode for ockl functions during linking
|
||||
// and then fails to inline them
|
||||
c"-Xclang",
|
||||
c"-fdenormal-fp-math=dynamic",
|
||||
c"-O3",
|
||||
c"-mno-wavefrontsize64",
|
||||
c"-mcumode",
|
||||
// Useful for inlining reports, combined with AMD_COMGR_SAVE_TEMPS=1 AMD_COMGR_EMIT_VERBOSE_LOGS=1 AMD_COMGR_REDIRECT_LOGS=stderr
|
||||
// c"-fsave-optimization-record=yaml",
|
||||
]
|
||||
.into_iter();
|
||||
let opt_options = if cfg!(debug_assertions) {
|
||||
//[c"-g", c"-mllvm", c"-print-before-all", c"", c""]
|
||||
[c"-g", c"", c"", c"", c""]
|
||||
} else {
|
||||
[
|
||||
@ -159,19 +164,14 @@ pub fn compile_bitcode(
|
||||
c"-inlinehint-threshold=3250",
|
||||
]
|
||||
};
|
||||
compile_action_info.set_options(common_options.chain(opt_options))?;
|
||||
let reloc_data_set = do_action(
|
||||
&with_device_libs,
|
||||
&compile_action_info,
|
||||
amd_comgr_action_kind_t::AMD_COMGR_ACTION_CODEGEN_BC_TO_RELOCATABLE,
|
||||
)?;
|
||||
compile_to_exec.set_options(common_options.chain(opt_options))?;
|
||||
let exec_data_set = do_action(
|
||||
&reloc_data_set,
|
||||
&compile_action_info,
|
||||
amd_comgr_action_kind_t::AMD_COMGR_ACTION_LINK_RELOCATABLE_TO_EXECUTABLE,
|
||||
&linked_data_set,
|
||||
&compile_to_exec,
|
||||
amd_comgr_action_kind_t::AMD_COMGR_ACTION_COMPILE_SOURCE_TO_EXECUTABLE,
|
||||
)?;
|
||||
let executable =
|
||||
exec_data_set.get_data(amd_comgr_data_kind_t::AMD_COMGR_DATA_KIND_EXECUTABLE, 0)?;
|
||||
exec_data_set.get_data(amd_comgr_data_kind_t::AMD_COMGR_DATA_KIND_EXECUTABLE, 0)?;
|
||||
executable.copy_content()
|
||||
}
|
||||
|
||||
|
@ -17,6 +17,10 @@ bitflags = "1.2"
|
||||
rustc-hash = "2.0.0"
|
||||
strum = "0.26"
|
||||
strum_macros = "0.26"
|
||||
petgraph = "0.7.1"
|
||||
microlp = "0.2.10"
|
||||
int-enum = "1.1"
|
||||
unwrap_or = "1.0.1"
|
||||
|
||||
[dev-dependencies]
|
||||
hip_runtime-sys = { path = "../ext/hip_runtime-sys" }
|
||||
|
@ -2,8 +2,8 @@ use super::*;
|
||||
|
||||
pub(super) fn run<'a, 'input>(
|
||||
resolver: &mut GlobalStringIdentResolver2<'input>,
|
||||
directives: Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>,
|
||||
) -> Result<Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
|
||||
directives: Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>,
|
||||
) -> Result<Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
|
||||
directives
|
||||
.into_iter()
|
||||
.map(|directive| run_directive(resolver, directive))
|
||||
@ -12,8 +12,8 @@ pub(super) fn run<'a, 'input>(
|
||||
|
||||
fn run_directive<'input>(
|
||||
resolver: &mut GlobalStringIdentResolver2,
|
||||
directive: Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>,
|
||||
) -> Result<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
|
||||
directive: Directive2<ast::Instruction<SpirvWord>, SpirvWord>,
|
||||
) -> Result<Directive2<ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
|
||||
Ok(match directive {
|
||||
var @ Directive2::Variable(..) => var,
|
||||
Directive2::Method(method) => Directive2::Method(run_method(resolver, method)?),
|
||||
@ -22,13 +22,13 @@ fn run_directive<'input>(
|
||||
|
||||
fn run_method<'input>(
|
||||
resolver: &mut GlobalStringIdentResolver2,
|
||||
mut method: Function2<'input, ast::Instruction<SpirvWord>, SpirvWord>,
|
||||
) -> Result<Function2<'input, ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
|
||||
mut method: Function2<ast::Instruction<SpirvWord>, SpirvWord>,
|
||||
) -> Result<Function2<ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
|
||||
let is_declaration = method.body.is_none();
|
||||
let mut body = Vec::new();
|
||||
let mut remap_returns = Vec::new();
|
||||
if !method.func_decl.name.is_kernel() {
|
||||
for arg in method.func_decl.return_arguments.iter_mut() {
|
||||
if !method.is_kernel {
|
||||
for arg in method.return_arguments.iter_mut() {
|
||||
match arg.state_space {
|
||||
ptx_parser::StateSpace::Param => {
|
||||
arg.state_space = ptx_parser::StateSpace::Reg;
|
||||
@ -51,7 +51,7 @@ fn run_method<'input>(
|
||||
_ => return Err(error_unreachable()),
|
||||
}
|
||||
}
|
||||
for arg in method.func_decl.input_arguments.iter_mut() {
|
||||
for arg in method.input_arguments.iter_mut() {
|
||||
match arg.state_space {
|
||||
ptx_parser::StateSpace::Param => {
|
||||
arg.state_space = ptx_parser::StateSpace::Reg;
|
||||
@ -95,14 +95,7 @@ fn run_method<'input>(
|
||||
Ok::<_, TranslateError>(body)
|
||||
})
|
||||
.transpose()?;
|
||||
Ok(Function2 {
|
||||
func_decl: method.func_decl,
|
||||
globals: method.globals,
|
||||
body,
|
||||
import_as: method.import_as,
|
||||
tuning: method.tuning,
|
||||
linkage: method.linkage,
|
||||
})
|
||||
Ok(Function2 { body, ..method })
|
||||
}
|
||||
|
||||
fn run_statement<'input>(
|
||||
|
@ -69,7 +69,10 @@ pub struct Module(LLVMModuleRef, Context);
|
||||
|
||||
impl Module {
|
||||
fn new(ctx: Context, name: &CStr) -> Self {
|
||||
Self(unsafe { LLVMModuleCreateWithNameInContext(name.as_ptr(), ctx.get()) }, ctx)
|
||||
Self(
|
||||
unsafe { LLVMModuleCreateWithNameInContext(name.as_ptr(), ctx.get()) },
|
||||
ctx,
|
||||
)
|
||||
}
|
||||
|
||||
fn get(&self) -> LLVMModuleRef {
|
||||
@ -183,9 +186,10 @@ impl Deref for MemoryBuffer {
|
||||
|
||||
pub(super) fn run<'input>(
|
||||
id_defs: GlobalStringIdentResolver2<'input>,
|
||||
directives: Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>,
|
||||
directives: Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>,
|
||||
) -> Result<Module, TranslateError> {
|
||||
let module = Module::new(Context::new(), LLVM_UNNAMED);
|
||||
let context = Context::new();
|
||||
let module = Module::new(context, LLVM_UNNAMED);
|
||||
let mut emit_ctx = ModuleEmitContext::new(&module, &id_defs);
|
||||
for directive in directives {
|
||||
match directive {
|
||||
@ -208,10 +212,7 @@ struct ModuleEmitContext<'a, 'input> {
|
||||
}
|
||||
|
||||
impl<'a, 'input> ModuleEmitContext<'a, 'input> {
|
||||
fn new(
|
||||
module: &Module,
|
||||
id_defs: &'a GlobalStringIdentResolver2<'input>,
|
||||
) -> Self {
|
||||
fn new(module: &Module, id_defs: &'a GlobalStringIdentResolver2<'input>) -> Self {
|
||||
let context = module.context();
|
||||
ModuleEmitContext {
|
||||
context: context.get(),
|
||||
@ -232,24 +233,20 @@ impl<'a, 'input> ModuleEmitContext<'a, 'input> {
|
||||
|
||||
fn emit_method(
|
||||
&mut self,
|
||||
method: Function2<'input, ast::Instruction<SpirvWord>, SpirvWord>,
|
||||
method: Function2<ast::Instruction<SpirvWord>, SpirvWord>,
|
||||
) -> Result<(), TranslateError> {
|
||||
let func_decl = method.func_decl;
|
||||
let name = method
|
||||
.import_as
|
||||
.as_deref()
|
||||
.or_else(|| match func_decl.name {
|
||||
ast::MethodName::Kernel(name) => Some(name),
|
||||
ast::MethodName::Func(id) => self.id_defs.ident_map[&id].name.as_deref(),
|
||||
})
|
||||
.or_else(|| self.id_defs.ident_map[&method.name].name.as_deref())
|
||||
.ok_or_else(|| error_unreachable())?;
|
||||
let name = CString::new(name).map_err(|_| error_unreachable())?;
|
||||
let mut fn_ = unsafe { LLVMGetNamedFunction(self.module, name.as_ptr()) };
|
||||
if fn_ == ptr::null_mut() {
|
||||
let fn_type = get_function_type(
|
||||
self.context,
|
||||
func_decl.return_arguments.iter().map(|v| &v.v_type),
|
||||
func_decl
|
||||
method.return_arguments.iter().map(|v| &v.v_type),
|
||||
method
|
||||
.input_arguments
|
||||
.iter()
|
||||
.map(|v| get_input_argument_type(self.context, &v.v_type, v.state_space)),
|
||||
@ -259,15 +256,28 @@ impl<'a, 'input> ModuleEmitContext<'a, 'input> {
|
||||
self.emit_fn_attribute(fn_, "uniform-work-group-size", "true");
|
||||
self.emit_fn_attribute(fn_, "no-trapping-math", "true");
|
||||
}
|
||||
if let ast::MethodName::Func(name) = func_decl.name {
|
||||
self.resolver.register(name, fn_);
|
||||
if !method.is_kernel {
|
||||
self.resolver.register(method.name, fn_);
|
||||
self.emit_fn_attribute(fn_, "denormal-fp-math-f32", "dynamic");
|
||||
self.emit_fn_attribute(fn_, "denormal-fp-math", "dynamic");
|
||||
} else {
|
||||
self.emit_fn_attribute(
|
||||
fn_,
|
||||
"denormal-fp-math-f32",
|
||||
llvm_ftz(method.flush_to_zero_f32),
|
||||
);
|
||||
self.emit_fn_attribute(
|
||||
fn_,
|
||||
"denormal-fp-math",
|
||||
llvm_ftz(method.flush_to_zero_f16f64),
|
||||
);
|
||||
}
|
||||
for (i, param) in func_decl.input_arguments.iter().enumerate() {
|
||||
for (i, param) in method.input_arguments.iter().enumerate() {
|
||||
let value = unsafe { LLVMGetParam(fn_, i as u32) };
|
||||
let name = self.resolver.get_or_add(param.name);
|
||||
unsafe { LLVMSetValueName2(value, name.as_ptr().cast(), name.len()) };
|
||||
self.resolver.register(param.name, value);
|
||||
if func_decl.name.is_kernel() {
|
||||
if method.is_kernel {
|
||||
let attr_kind = unsafe {
|
||||
LLVMGetEnumAttributeKindForName(b"byref".as_ptr().cast(), b"byref".len())
|
||||
};
|
||||
@ -281,7 +291,7 @@ impl<'a, 'input> ModuleEmitContext<'a, 'input> {
|
||||
unsafe { LLVMAddAttributeAtIndex(fn_, i as u32 + 1, attr) };
|
||||
}
|
||||
}
|
||||
let call_conv = if func_decl.name.is_kernel() {
|
||||
let call_conv = if method.is_kernel {
|
||||
Self::kernel_call_convention()
|
||||
} else {
|
||||
Self::func_call_convention()
|
||||
@ -296,7 +306,7 @@ impl<'a, 'input> ModuleEmitContext<'a, 'input> {
|
||||
unsafe { LLVMAppendBasicBlockInContext(self.context, fn_, LLVM_UNNAMED.as_ptr()) };
|
||||
unsafe { LLVMPositionBuilderAtEnd(self.builder.get(), real_bb) };
|
||||
let mut method_emitter = MethodEmitContext::new(self, fn_, variables_builder);
|
||||
for var in func_decl.return_arguments {
|
||||
for var in method.return_arguments {
|
||||
method_emitter.emit_variable(var)?;
|
||||
}
|
||||
for statement in statements.iter() {
|
||||
@ -304,6 +314,17 @@ impl<'a, 'input> ModuleEmitContext<'a, 'input> {
|
||||
method_emitter.emit_label_initial(*label);
|
||||
}
|
||||
}
|
||||
let mut statements = statements.into_iter();
|
||||
if let Some(Statement::Label(label)) = statements.next() {
|
||||
method_emitter.emit_label_delayed(label)?;
|
||||
} else {
|
||||
return Err(error_unreachable());
|
||||
}
|
||||
method_emitter.emit_kernel_rounding_prelude(
|
||||
method.is_kernel,
|
||||
method.rounding_mode_f32,
|
||||
method.rounding_mode_f16f64,
|
||||
)?;
|
||||
for statement in statements {
|
||||
method_emitter.emit_statement(statement)?;
|
||||
}
|
||||
@ -431,6 +452,14 @@ impl<'a, 'input> ModuleEmitContext<'a, 'input> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Translates a flush-to-zero flag into the value string used for the LLVM
/// `denormal-fp-math` / `denormal-fp-math-f32` function attributes.
///
/// `true` (flush denormals) yields `"preserve-sign"`; `false` (keep
/// denormals) yields `"ieee"`.
fn llvm_ftz(ftz: bool) -> &'static str {
    match ftz {
        true => "preserve-sign",
        false => "ieee",
    }
}
|
||||
|
||||
fn get_input_argument_type(
|
||||
context: LLVMContextRef,
|
||||
v_type: &ast::Type,
|
||||
@ -487,9 +516,32 @@ impl<'a> MethodEmitContext<'a> {
|
||||
Statement::FunctionPointer(_) => todo!(),
|
||||
Statement::VectorRead(vector_read) => self.emit_vector_read(vector_read)?,
|
||||
Statement::VectorWrite(vector_write) => self.emit_vector_write(vector_write)?,
|
||||
Statement::SetMode(mode_reg) => self.emit_set_mode(mode_reg)?,
|
||||
})
|
||||
}
|
||||
|
||||
// This should be a kernel attribute, but sadly AMDGPU LLVM target does
|
||||
// not support attribute for it. So we have to set it as the first
|
||||
// instruction in the body of a kernel
|
||||
fn emit_kernel_rounding_prelude(
|
||||
&mut self,
|
||||
is_kernel: bool,
|
||||
rounding_mode_f32: ast::RoundingMode,
|
||||
rounding_mode_f16f64: ast::RoundingMode,
|
||||
) -> Result<(), TranslateError> {
|
||||
if is_kernel {
|
||||
if rounding_mode_f32 != ast::RoundingMode::NearestEven
|
||||
|| rounding_mode_f16f64 != ast::RoundingMode::NearestEven
|
||||
{
|
||||
self.emit_set_mode(ModeRegister::Rounding {
|
||||
f32: rounding_mode_f32,
|
||||
f16f64: rounding_mode_f16f64,
|
||||
})?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn emit_variable(&mut self, var: ast::Variable<SpirvWord>) -> Result<(), TranslateError> {
|
||||
let alloca = unsafe {
|
||||
LLVMZludaBuildAlloca(
|
||||
@ -1143,7 +1195,7 @@ impl<'a> MethodEmitContext<'a> {
|
||||
let cos = self.emit_intrinsic(
|
||||
c"llvm.cos.f32",
|
||||
Some(arguments.dst),
|
||||
&ast::ScalarType::F32.into(),
|
||||
Some(&ast::ScalarType::F32.into()),
|
||||
vec![(self.resolver.value(arguments.src)?, llvm_f32)],
|
||||
)?;
|
||||
unsafe { LLVMZludaSetFastMathFlags(cos, LLVMZludaFastMathApproxFunc) }
|
||||
@ -1396,7 +1448,7 @@ impl<'a> MethodEmitContext<'a> {
|
||||
let sin = self.emit_intrinsic(
|
||||
c"llvm.sin.f32",
|
||||
Some(arguments.dst),
|
||||
&ast::ScalarType::F32.into(),
|
||||
Some(&ast::ScalarType::F32.into()),
|
||||
vec![(self.resolver.value(arguments.src)?, llvm_f32)],
|
||||
)?;
|
||||
unsafe { LLVMZludaSetFastMathFlags(sin, LLVMZludaFastMathApproxFunc) }
|
||||
@ -1407,12 +1459,12 @@ impl<'a> MethodEmitContext<'a> {
|
||||
&mut self,
|
||||
name: &CStr,
|
||||
dst: Option<SpirvWord>,
|
||||
return_type: &ast::Type,
|
||||
return_type: Option<&ast::Type>,
|
||||
arguments: Vec<(LLVMValueRef, LLVMTypeRef)>,
|
||||
) -> Result<LLVMValueRef, TranslateError> {
|
||||
let fn_type = get_function_type(
|
||||
self.context,
|
||||
iter::once(return_type),
|
||||
return_type.into_iter(),
|
||||
arguments.iter().map(|(_, type_)| Ok(*type_)),
|
||||
)?;
|
||||
let mut fn_ = unsafe { LLVMGetNamedFunction(self.module, name.as_ptr()) };
|
||||
@ -1573,7 +1625,7 @@ impl<'a> MethodEmitContext<'a> {
|
||||
return self.emit_cvt_float_to_int(
|
||||
data.from,
|
||||
data.to,
|
||||
integer_rounding.unwrap_or(ast::RoundingMode::NearestEven),
|
||||
integer_rounding,
|
||||
arguments,
|
||||
Some(LLVMBuildFPToSI),
|
||||
)
|
||||
@ -1631,7 +1683,7 @@ impl<'a> MethodEmitContext<'a> {
|
||||
let clamped = self.emit_intrinsic(
|
||||
c"llvm.umin",
|
||||
None,
|
||||
&from.into(),
|
||||
Some(&from.into()),
|
||||
vec![
|
||||
(self.resolver.value(arguments.src)?, from_llvm),
|
||||
(max, from_llvm),
|
||||
@ -1661,7 +1713,7 @@ impl<'a> MethodEmitContext<'a> {
|
||||
let zero_clamped = self.emit_intrinsic(
|
||||
unsafe { CStr::from_bytes_with_nul_unchecked(zero_clamp_intrinsic.as_bytes()) },
|
||||
None,
|
||||
&from.into(),
|
||||
Some(&from.into()),
|
||||
vec![
|
||||
(self.resolver.value(arguments.src)?, from_llvm),
|
||||
(zero, from_llvm),
|
||||
@ -1680,7 +1732,7 @@ impl<'a> MethodEmitContext<'a> {
|
||||
let fully_clamped = self.emit_intrinsic(
|
||||
unsafe { CStr::from_bytes_with_nul_unchecked(max_clamp_intrinsic.as_bytes()) },
|
||||
None,
|
||||
&from.into(),
|
||||
Some(&from.into()),
|
||||
vec![(zero_clamped, from_llvm), (max, from_llvm)],
|
||||
)?;
|
||||
let resize_fn = if to.layout().size() >= from.layout().size() {
|
||||
@ -1720,7 +1772,7 @@ impl<'a> MethodEmitContext<'a> {
|
||||
let rounded_float = self.emit_intrinsic(
|
||||
unsafe { CStr::from_bytes_with_nul_unchecked(intrinsic.as_bytes()) },
|
||||
None,
|
||||
&from.into(),
|
||||
Some(&from.into()),
|
||||
vec![(
|
||||
self.resolver.value(arguments.src)?,
|
||||
get_scalar_type(self.context, from),
|
||||
@ -1789,7 +1841,7 @@ impl<'a> MethodEmitContext<'a> {
|
||||
self.emit_intrinsic(
|
||||
intrinsic,
|
||||
Some(arguments.dst),
|
||||
&data.type_.into(),
|
||||
Some(&data.type_.into()),
|
||||
vec![(self.resolver.value(arguments.src)?, type_)],
|
||||
)?;
|
||||
Ok(())
|
||||
@ -1810,7 +1862,7 @@ impl<'a> MethodEmitContext<'a> {
|
||||
self.emit_intrinsic(
|
||||
intrinsic,
|
||||
Some(arguments.dst),
|
||||
&data.type_.into(),
|
||||
Some(&data.type_.into()),
|
||||
vec![(self.resolver.value(arguments.src)?, type_)],
|
||||
)?;
|
||||
Ok(())
|
||||
@ -1832,7 +1884,7 @@ impl<'a> MethodEmitContext<'a> {
|
||||
self.emit_intrinsic(
|
||||
intrinsic,
|
||||
Some(arguments.dst),
|
||||
&data.type_.into(),
|
||||
Some(&data.type_.into()),
|
||||
vec![(self.resolver.value(arguments.src)?, type_)],
|
||||
)?;
|
||||
Ok(())
|
||||
@ -1954,7 +2006,7 @@ impl<'a> MethodEmitContext<'a> {
|
||||
self.emit_intrinsic(
|
||||
intrinsic,
|
||||
Some(arguments.dst),
|
||||
&data.type_.into(),
|
||||
Some(&data.type_.into()),
|
||||
vec![(
|
||||
self.resolver.value(arguments.src)?,
|
||||
get_scalar_type(self.context, data.type_),
|
||||
@ -1971,7 +2023,7 @@ impl<'a> MethodEmitContext<'a> {
|
||||
self.emit_intrinsic(
|
||||
c"llvm.amdgcn.log.f32",
|
||||
Some(arguments.dst),
|
||||
&ast::ScalarType::F32.into(),
|
||||
Some(&ast::ScalarType::F32.into()),
|
||||
vec![(
|
||||
self.resolver.value(arguments.src)?,
|
||||
get_scalar_type(self.context, ast::ScalarType::F32.into()),
|
||||
@ -2026,7 +2078,7 @@ impl<'a> MethodEmitContext<'a> {
|
||||
self.emit_intrinsic(
|
||||
intrinsic,
|
||||
Some(arguments.dst),
|
||||
&type_.into(),
|
||||
Some(&type_.into()),
|
||||
vec![(self.resolver.value(arguments.src)?, llvm_type)],
|
||||
)?;
|
||||
Ok(())
|
||||
@ -2050,7 +2102,7 @@ impl<'a> MethodEmitContext<'a> {
|
||||
self.emit_intrinsic(
|
||||
unsafe { CStr::from_bytes_with_nul_unchecked(intrinsic.as_bytes()) },
|
||||
Some(arguments.dst),
|
||||
&data.type_().into(),
|
||||
Some(&data.type_().into()),
|
||||
vec![
|
||||
(self.resolver.value(arguments.src1)?, llvm_type),
|
||||
(self.resolver.value(arguments.src2)?, llvm_type),
|
||||
@ -2077,7 +2129,7 @@ impl<'a> MethodEmitContext<'a> {
|
||||
self.emit_intrinsic(
|
||||
unsafe { CStr::from_bytes_with_nul_unchecked(intrinsic.as_bytes()) },
|
||||
Some(arguments.dst),
|
||||
&data.type_().into(),
|
||||
Some(&data.type_().into()),
|
||||
vec![
|
||||
(self.resolver.value(arguments.src1)?, llvm_type),
|
||||
(self.resolver.value(arguments.src2)?, llvm_type),
|
||||
@ -2095,7 +2147,7 @@ impl<'a> MethodEmitContext<'a> {
|
||||
self.emit_intrinsic(
|
||||
unsafe { CStr::from_bytes_with_nul_unchecked(intrinsic.as_bytes()) },
|
||||
Some(arguments.dst),
|
||||
&data.type_.into(),
|
||||
Some(&data.type_.into()),
|
||||
vec![
|
||||
(
|
||||
self.resolver.value(arguments.src1)?,
|
||||
@ -2216,7 +2268,7 @@ impl<'a> MethodEmitContext<'a> {
|
||||
self.emit_intrinsic(
|
||||
unsafe { CStr::from_bytes_with_nul_unchecked(llvm_intrinsic.as_bytes()) },
|
||||
Some(arguments.dst),
|
||||
&data.type_.into(),
|
||||
Some(&data.type_.into()),
|
||||
intrinsic_arguments,
|
||||
)?;
|
||||
Ok(())
|
||||
@ -2229,13 +2281,69 @@ impl<'a> MethodEmitContext<'a> {
|
||||
) -> Result<(), TranslateError> {
|
||||
let src1 = self.resolver.value(arguments.src1)?;
|
||||
let src2 = self.resolver.value(arguments.src2)?;
|
||||
self.emit_intrinsic(c"llvm.amdgcn.mul.u24", Some(arguments.dst), &ast::Type::Scalar(data.type_), vec![
|
||||
(src1, get_scalar_type(self.context, data.type_)),
|
||||
(src2, get_scalar_type(self.context, data.type_)),
|
||||
])?;
|
||||
self.emit_intrinsic(
|
||||
c"llvm.amdgcn.mul.u24",
|
||||
Some(arguments.dst),
|
||||
Some(&ast::Type::Scalar(data.type_)),
|
||||
vec![
|
||||
(src1, get_scalar_type(self.context, data.type_)),
|
||||
(src2, get_scalar_type(self.context, data.type_)),
|
||||
],
|
||||
)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
fn emit_set_mode(&mut self, mode_reg: ModeRegister) -> Result<(), TranslateError> {
|
||||
fn hwreg(reg: u32, offset: u32, size: u32) -> u32 {
|
||||
reg | (offset << 6) | ((size - 1) << 11)
|
||||
}
|
||||
fn denormal_to_value(ftz: bool) -> u32 {
|
||||
if ftz {
|
||||
0
|
||||
} else {
|
||||
3
|
||||
}
|
||||
}
|
||||
fn rounding_to_value(ftz: ast::RoundingMode) -> u32 {
|
||||
match ftz {
|
||||
ptx_parser::RoundingMode::NearestEven => 0,
|
||||
ptx_parser::RoundingMode::Zero => 3,
|
||||
ptx_parser::RoundingMode::NegativeInf => 2,
|
||||
ptx_parser::RoundingMode::PositiveInf => 1,
|
||||
}
|
||||
}
|
||||
fn merge_regs(f32: u32, f16f64: u32) -> u32 {
|
||||
f32 | f16f64 << 2
|
||||
}
|
||||
let intrinsic = c"llvm.amdgcn.s.setreg";
|
||||
let (hwreg, value) = match mode_reg {
|
||||
ModeRegister::Denormal { f32, f16f64 } => {
|
||||
let hwreg = hwreg(1, 4, 4);
|
||||
let f32 = denormal_to_value(f32);
|
||||
let f16f64 = denormal_to_value(f16f64);
|
||||
let value = merge_regs(f32, f16f64);
|
||||
(hwreg, value)
|
||||
}
|
||||
ModeRegister::Rounding { f32, f16f64 } => {
|
||||
let hwreg = hwreg(1, 0, 4);
|
||||
let f32 = rounding_to_value(f32);
|
||||
let f16f64 = rounding_to_value(f16f64);
|
||||
let value = merge_regs(f32, f16f64);
|
||||
(hwreg, value)
|
||||
}
|
||||
};
|
||||
let llvm_i32 = get_scalar_type(self.context, ast::ScalarType::B32);
|
||||
let hwreg_llvm = unsafe { LLVMConstInt(llvm_i32, hwreg as _, 0) };
|
||||
let value_llvm = unsafe { LLVMConstInt(llvm_i32, value as _, 0) };
|
||||
self.emit_intrinsic(
|
||||
intrinsic,
|
||||
None,
|
||||
None,
|
||||
vec![(hwreg_llvm, llvm_i32), (value_llvm, llvm_i32)],
|
||||
)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/*
|
||||
// Currently unused, LLVM 18 (ROCm 6.2) does not support `llvm.set.rounding`
|
||||
// Should be available in LLVM 19
|
||||
|
@ -2,8 +2,8 @@ use super::*;
|
||||
|
||||
pub(super) fn run<'a, 'input>(
|
||||
resolver: &mut GlobalStringIdentResolver2<'input>,
|
||||
directives: Vec<UnconditionalDirective<'input>>,
|
||||
) -> Result<Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
|
||||
directives: Vec<UnconditionalDirective>,
|
||||
) -> Result<Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
|
||||
directives
|
||||
.into_iter()
|
||||
.map(|directive| run_directive(resolver, directive))
|
||||
@ -13,11 +13,10 @@ pub(super) fn run<'a, 'input>(
|
||||
fn run_directive<'input>(
|
||||
resolver: &mut GlobalStringIdentResolver2<'input>,
|
||||
directive: Directive2<
|
||||
'input,
|
||||
ast::Instruction<ast::ParsedOperand<SpirvWord>>,
|
||||
ast::ParsedOperand<SpirvWord>,
|
||||
>,
|
||||
) -> Result<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
|
||||
) -> Result<Directive2<ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
|
||||
Ok(match directive {
|
||||
Directive2::Variable(linking, var) => Directive2::Variable(linking, var),
|
||||
Directive2::Method(method) => Directive2::Method(run_method(resolver, method)?),
|
||||
@ -27,11 +26,10 @@ fn run_directive<'input>(
|
||||
fn run_method<'input>(
|
||||
resolver: &mut GlobalStringIdentResolver2<'input>,
|
||||
method: Function2<
|
||||
'input,
|
||||
ast::Instruction<ast::ParsedOperand<SpirvWord>>,
|
||||
ast::ParsedOperand<SpirvWord>,
|
||||
>,
|
||||
) -> Result<Function2<'input, ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
|
||||
) -> Result<Function2<ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
|
||||
let body = method
|
||||
.body
|
||||
.map(|statements| {
|
||||
@ -43,12 +41,18 @@ fn run_method<'input>(
|
||||
})
|
||||
.transpose()?;
|
||||
Ok(Function2 {
|
||||
func_decl: method.func_decl,
|
||||
globals: method.globals,
|
||||
body,
|
||||
return_arguments: method.return_arguments,
|
||||
name: method.name,
|
||||
input_arguments: method.input_arguments,
|
||||
import_as: method.import_as,
|
||||
tuning: method.tuning,
|
||||
linkage: method.linkage,
|
||||
is_kernel: method.is_kernel,
|
||||
flush_to_zero_f32: method.flush_to_zero_f32,
|
||||
flush_to_zero_f16f64: method.flush_to_zero_f16f64,
|
||||
rounding_mode_f32: method.rounding_mode_f32,
|
||||
rounding_mode_f16f64: method.rounding_mode_f16f64,
|
||||
})
|
||||
}
|
||||
|
||||
|
@ -1,30 +1,33 @@
|
||||
use super::*;
|
||||
|
||||
pub(super) fn run<'a, 'input>(
|
||||
resolver: &mut GlobalStringIdentResolver2<'input>,
|
||||
resolver: &'a mut GlobalStringIdentResolver2<'input>,
|
||||
special_registers: &'a SpecialRegistersMap2,
|
||||
directives: Vec<UnconditionalDirective<'input>>,
|
||||
) -> Result<Vec<UnconditionalDirective<'input>>, TranslateError> {
|
||||
let declarations = SpecialRegistersMap2::generate_declarations(resolver);
|
||||
let mut result = Vec::with_capacity(declarations.len() + directives.len());
|
||||
directives: Vec<UnconditionalDirective>,
|
||||
) -> Result<Vec<UnconditionalDirective>, TranslateError> {
|
||||
let mut result = Vec::with_capacity(SpecialRegistersMap2::len() + directives.len());
|
||||
let mut sreg_to_function =
|
||||
FxHashMap::with_capacity_and_hasher(declarations.len(), Default::default());
|
||||
for (sreg, declaration) in declarations {
|
||||
let name = if let ast::MethodName::Func(name) = declaration.name {
|
||||
name
|
||||
} else {
|
||||
return Err(error_unreachable());
|
||||
};
|
||||
result.push(UnconditionalDirective::Method(UnconditionalFunction {
|
||||
func_decl: declaration,
|
||||
globals: Vec::new(),
|
||||
body: None,
|
||||
import_as: None,
|
||||
tuning: Vec::new(),
|
||||
linkage: ast::LinkingDirective::EXTERN,
|
||||
}));
|
||||
sreg_to_function.insert(sreg, name);
|
||||
}
|
||||
FxHashMap::with_capacity_and_hasher(SpecialRegistersMap2::len(), Default::default());
|
||||
SpecialRegistersMap2::foreach_declaration(
|
||||
resolver,
|
||||
|sreg, (return_arguments, name, input_arguments)| {
|
||||
result.push(UnconditionalDirective::Method(UnconditionalFunction {
|
||||
return_arguments,
|
||||
name,
|
||||
input_arguments,
|
||||
body: None,
|
||||
import_as: None,
|
||||
tuning: Vec::new(),
|
||||
linkage: ast::LinkingDirective::EXTERN,
|
||||
is_kernel: false,
|
||||
flush_to_zero_f32: false,
|
||||
flush_to_zero_f16f64: false,
|
||||
rounding_mode_f32: ptx_parser::RoundingMode::NearestEven,
|
||||
rounding_mode_f16f64: ptx_parser::RoundingMode::NearestEven,
|
||||
}));
|
||||
sreg_to_function.insert(sreg, name);
|
||||
},
|
||||
);
|
||||
let mut visitor = SpecialRegisterResolver {
|
||||
resolver,
|
||||
special_registers,
|
||||
@ -39,8 +42,8 @@ pub(super) fn run<'a, 'input>(
|
||||
|
||||
fn run_directive<'a, 'input>(
|
||||
visitor: &mut SpecialRegisterResolver<'a, 'input>,
|
||||
directive: UnconditionalDirective<'input>,
|
||||
) -> Result<UnconditionalDirective<'input>, TranslateError> {
|
||||
directive: UnconditionalDirective,
|
||||
) -> Result<UnconditionalDirective, TranslateError> {
|
||||
Ok(match directive {
|
||||
var @ Directive2::Variable(..) => var,
|
||||
Directive2::Method(method) => Directive2::Method(run_method(visitor, method)?),
|
||||
@ -49,8 +52,8 @@ fn run_directive<'a, 'input>(
|
||||
|
||||
fn run_method<'a, 'input>(
|
||||
visitor: &mut SpecialRegisterResolver<'a, 'input>,
|
||||
method: UnconditionalFunction<'input>,
|
||||
) -> Result<UnconditionalFunction<'input>, TranslateError> {
|
||||
method: UnconditionalFunction,
|
||||
) -> Result<UnconditionalFunction, TranslateError> {
|
||||
let body = method
|
||||
.body
|
||||
.map(|statements| {
|
||||
@ -61,14 +64,7 @@ fn run_method<'a, 'input>(
|
||||
Ok::<_, TranslateError>(result)
|
||||
})
|
||||
.transpose()?;
|
||||
Ok(Function2 {
|
||||
func_decl: method.func_decl,
|
||||
globals: method.globals,
|
||||
body,
|
||||
import_as: method.import_as,
|
||||
tuning: method.tuning,
|
||||
linkage: method.linkage,
|
||||
})
|
||||
Ok(Function2 { body, ..method })
|
||||
}
|
||||
|
||||
fn run_statement<'a, 'input>(
|
||||
|
@ -1,8 +1,8 @@
|
||||
use super::*;
|
||||
|
||||
pub(super) fn run<'input>(
|
||||
directives: Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>,
|
||||
) -> Result<Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
|
||||
directives: Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>,
|
||||
) -> Result<Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
|
||||
let mut result = Vec::with_capacity(directives.len());
|
||||
for mut directive in directives.into_iter() {
|
||||
run_directive(&mut result, &mut directive)?;
|
||||
@ -12,8 +12,8 @@ pub(super) fn run<'input>(
|
||||
}
|
||||
|
||||
fn run_directive<'input>(
|
||||
result: &mut Vec<Directive2<'input, ptx_parser::Instruction<SpirvWord>, SpirvWord>>,
|
||||
directive: &mut Directive2<'input, ptx_parser::Instruction<SpirvWord>, SpirvWord>,
|
||||
result: &mut Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>,
|
||||
directive: &mut Directive2<ast::Instruction<SpirvWord>, SpirvWord>,
|
||||
) -> Result<(), TranslateError> {
|
||||
match directive {
|
||||
Directive2::Variable(..) => {}
|
||||
@ -23,8 +23,8 @@ fn run_directive<'input>(
|
||||
}
|
||||
|
||||
fn run_function<'input>(
|
||||
result: &mut Vec<Directive2<'input, ptx_parser::Instruction<SpirvWord>, SpirvWord>>,
|
||||
function: &mut Function2<'input, ptx_parser::Instruction<SpirvWord>, SpirvWord>,
|
||||
result: &mut Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>,
|
||||
function: &mut Function2<ast::Instruction<SpirvWord>, SpirvWord>,
|
||||
) {
|
||||
function.body = function.body.take().map(|statements| {
|
||||
statements
|
||||
|
@ -11,8 +11,8 @@ use super::*;
|
||||
// pass, so we do nothing there
|
||||
pub(super) fn run<'a, 'input>(
|
||||
resolver: &mut GlobalStringIdentResolver2<'input>,
|
||||
directives: Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>,
|
||||
) -> Result<Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
|
||||
directives: Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>,
|
||||
) -> Result<Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
|
||||
directives
|
||||
.into_iter()
|
||||
.map(|directive| run_directive(resolver, directive))
|
||||
@ -21,8 +21,8 @@ pub(super) fn run<'a, 'input>(
|
||||
|
||||
fn run_directive<'a, 'input>(
|
||||
resolver: &mut GlobalStringIdentResolver2<'input>,
|
||||
directive: Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>,
|
||||
) -> Result<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
|
||||
directive: Directive2<ast::Instruction<SpirvWord>, SpirvWord>,
|
||||
) -> Result<Directive2<ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
|
||||
Ok(match directive {
|
||||
var @ Directive2::Variable(..) => var,
|
||||
Directive2::Method(method) => {
|
||||
@ -34,12 +34,11 @@ fn run_directive<'a, 'input>(
|
||||
|
||||
fn run_method<'a, 'input>(
|
||||
mut visitor: InsertMemSSAVisitor<'a, 'input>,
|
||||
method: Function2<'input, ast::Instruction<SpirvWord>, SpirvWord>,
|
||||
) -> Result<Function2<'input, ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
|
||||
let mut func_decl = method.func_decl;
|
||||
let is_kernel = func_decl.name.is_kernel();
|
||||
mut method: Function2<ast::Instruction<SpirvWord>, SpirvWord>,
|
||||
) -> Result<Function2<ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
|
||||
let is_kernel = method.is_kernel;
|
||||
if is_kernel {
|
||||
for arg in func_decl.input_arguments.iter_mut() {
|
||||
for arg in method.input_arguments.iter_mut() {
|
||||
let old_name = arg.name;
|
||||
let old_space = arg.state_space;
|
||||
let new_space = ast::StateSpace::ParamEntry;
|
||||
@ -51,10 +50,10 @@ fn run_method<'a, 'input>(
|
||||
arg.state_space = new_space;
|
||||
}
|
||||
};
|
||||
for arg in func_decl.return_arguments.iter_mut() {
|
||||
for arg in method.return_arguments.iter_mut() {
|
||||
visitor.visit_variable(arg)?;
|
||||
}
|
||||
let return_arguments = &func_decl.return_arguments[..];
|
||||
let return_arguments = &method.return_arguments[..];
|
||||
let body = method
|
||||
.body
|
||||
.map(move |statements| {
|
||||
@ -65,14 +64,7 @@ fn run_method<'a, 'input>(
|
||||
Ok::<_, TranslateError>(result)
|
||||
})
|
||||
.transpose()?;
|
||||
Ok(Function2 {
|
||||
func_decl: func_decl,
|
||||
globals: method.globals,
|
||||
body,
|
||||
import_as: method.import_as,
|
||||
tuning: method.tuning,
|
||||
linkage: method.linkage,
|
||||
})
|
||||
Ok(Function2 { body, ..method })
|
||||
}
|
||||
|
||||
fn run_statement<'a, 'input>(
|
||||
|
@ -19,8 +19,8 @@ use ptx_parser as ast;
|
||||
*/
|
||||
pub(super) fn run<'input>(
|
||||
resolver: &mut GlobalStringIdentResolver2<'input>,
|
||||
directives: Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>,
|
||||
) -> Result<Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
|
||||
directives: Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>,
|
||||
) -> Result<Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
|
||||
directives
|
||||
.into_iter()
|
||||
.map(|directive| run_directive(resolver, directive))
|
||||
@ -29,8 +29,8 @@ pub(super) fn run<'input>(
|
||||
|
||||
fn run_directive<'a, 'input>(
|
||||
resolver: &mut GlobalStringIdentResolver2<'input>,
|
||||
directive: Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>,
|
||||
) -> Result<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
|
||||
directive: Directive2<ast::Instruction<SpirvWord>, SpirvWord>,
|
||||
) -> Result<Directive2<ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
|
||||
Ok(match directive {
|
||||
var @ Directive2::Variable(..) => var,
|
||||
Directive2::Method(mut method) => {
|
||||
|
@ -0,0 +1,29 @@
|
||||
.version 6.5
|
||||
.target sm_50
|
||||
.address_size 64
|
||||
|
||||
.func use_modes();
|
||||
|
||||
// Test fixture kernel: a flush-to-zero f32 add rounding toward zero (.rz),
// then a call to use_modes, then a flush-to-zero f32 add rounding toward
// positive infinity (.rp). Appears to be the input of the `call_with_mode`
// Rust test, which asserts on the exact statement sequence - keep the
// instruction order intact.
.visible .entry kernel()
|
||||
{
|
||||
.reg .f32 temp;
|
||||
|
||||
add.rz.ftz.f32 temp, temp, temp;
|
||||
call use_modes;
|
||||
add.rp.ftz.f32 temp, temp, temp;
|
||||
ret;
|
||||
}
|
||||
|
||||
// Test fixture function with two exits that need different rounding modes:
// the SET_RM path performs an f32 add rounding toward negative infinity (.rm),
// the SET_RZ path performs an f32 add rounding toward zero (.rz).
// Neither add carries .ftz. `pred` is never written before it is tested; only
// the control-flow shape matters here.
.func use_modes()
|
||||
{
|
||||
.reg .f32 temp;
|
||||
.reg .pred pred;
|
||||
@pred bra SET_RM;
|
||||
@!pred bra SET_RZ;
|
||||
SET_RM:
|
||||
add.rm.f32 temp, temp, temp;
|
||||
ret;
|
||||
SET_RZ:
|
||||
add.rz.f32 temp, temp, temp;
|
||||
ret;
|
||||
}
|
@ -0,0 +1,15 @@
|
||||
.version 6.5
|
||||
.target sm_30
|
||||
.address_size 64
|
||||
|
||||
// Test fixture kernel: two adds with .ftz (f16, then f32) followed by two adds
// without .ftz (f16, then f32). Appears to be the input of the `fold_denormal`
// Rust test, which asserts on the exact statement sequence.
// NOTE(review): the f16 adds operate on registers declared as .f32.
.visible .entry add()
|
||||
{
|
||||
.reg .f32 temp<3>;
|
||||
|
||||
add.ftz.f16 temp2, temp1, temp0;
|
||||
add.ftz.f32 temp2, temp1, temp0;
|
||||
|
||||
add.f16 temp2, temp1, temp0;
|
||||
add.f32 temp2, temp1, temp0;
|
||||
ret;
|
||||
}
|
1907
ptx/src/pass/instruction_mode_to_global_mode/mod.rs
Normal file
1907
ptx/src/pass/instruction_mode_to_global_mode/mod.rs
Normal file
File diff suppressed because it is too large
Load Diff
399
ptx/src/pass/instruction_mode_to_global_mode/test.rs
Normal file
399
ptx/src/pass/instruction_mode_to_global_mode/test.rs
Normal file
@ -0,0 +1,399 @@
|
||||
use super::*;
|
||||
use int_enum::IntEnum;
|
||||
use strum::EnumCount;
|
||||
|
||||
/// Two-valued enum with explicit `usize` discriminants (`False = 0`, `True = 1`)
/// that derives `IntEnum`.
// NOTE(review): no test in this module refers to `Bool`; it may be a leftover.
#[repr(usize)]
|
||||
#[derive(IntEnum, Eq, PartialEq, Copy, Clone, Debug)]
|
||||
enum Bool {
|
||||
False = 0,
|
||||
True = 1,
|
||||
}
|
||||
|
||||
fn ftz() -> InstructionModes {
|
||||
InstructionModes {
|
||||
denormal_f32: Some(DenormalMode::FlushToZero),
|
||||
denormal_f16f64: None,
|
||||
rounding_f32: None,
|
||||
rounding_f16f64: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn preserve() -> InstructionModes {
|
||||
InstructionModes {
|
||||
denormal_f32: Some(DenormalMode::Preserve),
|
||||
denormal_f16f64: None,
|
||||
rounding_f32: None,
|
||||
rounding_f16f64: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Linear graph: entry -> flush-to-zero block -> block without modes ->
/// flush-to-zero block.
///
/// Expects no mandatory insertion, a single "maybe" insertion at the first
/// flush-to-zero block associated with the entry, and an optimized result with
/// no per-block insertion where the entry itself gets `FlushToZero`.
#[test]
fn transitive_mixed() {
    let entry_id = SpirvWord(1);
    let first_ftz_id = SpirvWord(2);
    let passthrough_id = SpirvWord(3);
    let second_ftz_id = SpirvWord(4);

    let mut cfg = ControlFlowGraph::new();
    let entry_block = cfg.add_entry_basic_block(entry_id);
    cfg.add_jump(entry_block, first_ftz_id);
    let first_ftz_block = cfg.get_or_add_basic_block(first_ftz_id);
    cfg.set_modes(first_ftz_block, ftz(), ftz());
    cfg.add_jump(first_ftz_block, passthrough_id);
    let passthrough_block = cfg.get_or_add_basic_block(passthrough_id);
    cfg.add_jump(passthrough_block, second_ftz_id);
    let second_ftz_block = cfg.get_or_add_basic_block(second_ftz_id);
    cfg.set_modes(second_ftz_block, ftz(), ftz());

    let partial = super::compute_single_mode_insertions(&cfg, |modes| modes.denormal_f32);
    assert_eq!(partial.bb_must_insert_mode.len(), 0);
    assert_eq!(partial.bb_maybe_insert_mode.len(), 1);
    assert_eq!(
        partial.bb_maybe_insert_mode[&first_ftz_id],
        (DenormalMode::FlushToZero, iter::once(entry_id).collect())
    );

    let optimized = optimize_mode_insertions::<DenormalMode, { DenormalMode::COUNT }>(partial);
    assert_eq!(optimized.basic_blocks.len(), 0);
    assert_eq!(optimized.kernels.len(), 1);
    assert_eq!(optimized.kernels[&entry_id], DenormalMode::FlushToZero);
}
|
||||
|
||||
/// Linear graph: entry -> flush-to-zero block -> block without modes ->
/// denormal-preserving block.
///
/// Expects the preserving block to be a mandatory insertion point, the
/// flush-to-zero block to be a "maybe" insertion associated with the entry,
/// and an optimized result that inserts a mode change only at the preserving
/// block while the entry gets `FlushToZero`.
#[test]
fn transitive_change_twice() {
    let entry_id = SpirvWord(1);
    let ftz_id = SpirvWord(2);
    let passthrough_id = SpirvWord(3);
    let preserve_id = SpirvWord(4);

    let mut cfg = ControlFlowGraph::new();
    let entry_block = cfg.add_entry_basic_block(entry_id);
    cfg.add_jump(entry_block, ftz_id);
    let ftz_block = cfg.get_or_add_basic_block(ftz_id);
    cfg.set_modes(ftz_block, ftz(), ftz());
    cfg.add_jump(ftz_block, passthrough_id);
    let passthrough_block = cfg.get_or_add_basic_block(passthrough_id);
    cfg.add_jump(passthrough_block, preserve_id);
    let preserve_block = cfg.get_or_add_basic_block(preserve_id);
    cfg.set_modes(preserve_block, preserve(), preserve());

    let partial = super::compute_single_mode_insertions(&cfg, |modes| modes.denormal_f32);
    assert_eq!(partial.bb_must_insert_mode.len(), 1);
    assert!(partial.bb_must_insert_mode.contains(&preserve_id));
    assert_eq!(partial.bb_maybe_insert_mode.len(), 1);
    assert_eq!(
        partial.bb_maybe_insert_mode[&ftz_id],
        (DenormalMode::FlushToZero, iter::once(entry_id).collect())
    );

    let optimized = optimize_mode_insertions::<DenormalMode, { DenormalMode::COUNT }>(partial);
    assert_eq!(optimized.basic_blocks, iter::once(preserve_id).collect());
    assert_eq!(optimized.kernels.len(), 1);
    assert_eq!(optimized.kernels[&entry_id], DenormalMode::FlushToZero);
}
|
||||
|
||||
/// Linear graph: entry -> block without modes -> denormal-preserving block.
///
/// Expects no mandatory insertion, a single "maybe" insertion at the
/// preserving block associated with the entry, and an optimized result with no
/// per-block insertion where the entry itself gets `Preserve`.
#[test]
fn transitive_change() {
    let entry_id = SpirvWord(1);
    let passthrough_id = SpirvWord(2);
    let preserve_id = SpirvWord(3);

    let mut cfg = ControlFlowGraph::new();
    let entry_block = cfg.add_entry_basic_block(entry_id);
    cfg.add_jump(entry_block, passthrough_id);
    let passthrough_block = cfg.get_or_add_basic_block(passthrough_id);
    cfg.add_jump(passthrough_block, preserve_id);
    let preserve_block = cfg.get_or_add_basic_block(preserve_id);
    cfg.set_modes(preserve_block, preserve(), preserve());

    let partial = super::compute_single_mode_insertions(&cfg, |modes| modes.denormal_f32);
    assert_eq!(partial.bb_must_insert_mode.len(), 0);
    assert_eq!(partial.bb_maybe_insert_mode.len(), 1);
    assert_eq!(
        partial.bb_maybe_insert_mode[&preserve_id],
        (DenormalMode::Preserve, iter::once(entry_id).collect())
    );

    let optimized = optimize_mode_insertions::<DenormalMode, { DenormalMode::COUNT }>(partial);
    assert_eq!(optimized.basic_blocks.len(), 0);
    assert_eq!(optimized.kernels.len(), 1);
    assert_eq!(optimized.kernels[&entry_id], DenormalMode::Preserve);
}
|
||||
|
||||
/// Diamond-like graph with a cycle among blocks that set no modes:
/// the entry jumps to a left and a right flush-to-zero block; each of those
/// jumps to its own mode-less block; both mode-less blocks jump to a shared
/// middle block, which jumps back to both of them.
///
/// Expects no mandatory insertion, "maybe" insertions at both flush-to-zero
/// blocks (each associated with the entry), and an optimized result with no
/// per-block insertion where the entry itself gets `FlushToZero`.
#[test]
fn codependency() {
    let entry_id = SpirvWord(1);
    let left_ftz_id = SpirvWord(2);
    let right_ftz_id = SpirvWord(3);
    let left_plain_id = SpirvWord(4);
    let middle_plain_id = SpirvWord(5);
    let right_plain_id = SpirvWord(6);

    let mut cfg = ControlFlowGraph::new();
    let entry_block = cfg.add_entry_basic_block(entry_id);
    cfg.add_jump(entry_block, left_ftz_id);
    cfg.add_jump(entry_block, right_ftz_id);
    let left_ftz_block = cfg.get_or_add_basic_block(left_ftz_id);
    cfg.set_modes(left_ftz_block, ftz(), ftz());
    let right_ftz_block = cfg.get_or_add_basic_block(right_ftz_id);
    cfg.set_modes(right_ftz_block, ftz(), ftz());
    cfg.add_jump(left_ftz_block, left_plain_id);
    let left_plain_block = cfg.get_or_add_basic_block(left_plain_id);
    cfg.add_jump(right_ftz_block, right_plain_id);
    let right_plain_block = cfg.get_or_add_basic_block(right_plain_id);
    cfg.add_jump(left_plain_block, middle_plain_id);
    cfg.add_jump(right_plain_block, middle_plain_id);
    let middle_plain_block = cfg.get_or_add_basic_block(middle_plain_id);
    cfg.add_jump(middle_plain_block, left_plain_id);
    cfg.add_jump(middle_plain_block, right_plain_id);
    // Debugging aid: dump the graph in DOT format.
    //println!(
    //    "{:?}",
    //    petgraph::dot::Dot::with_config(&cfg.graph, &[petgraph::dot::Config::EdgeNoLabel])
    //);

    let partial = super::compute_single_mode_insertions(&cfg, |modes| modes.denormal_f32);
    assert_eq!(partial.bb_must_insert_mode.len(), 0);
    assert_eq!(partial.bb_maybe_insert_mode.len(), 2);
    assert_eq!(
        partial.bb_maybe_insert_mode[&left_ftz_id],
        (DenormalMode::FlushToZero, iter::once(entry_id).collect())
    );
    assert_eq!(
        partial.bb_maybe_insert_mode[&right_ftz_id],
        (DenormalMode::FlushToZero, iter::once(entry_id).collect())
    );

    let optimized = optimize_mode_insertions::<DenormalMode, { DenormalMode::COUNT }>(partial);
    assert_eq!(optimized.basic_blocks.len(), 0);
    assert_eq!(optimized.kernels.len(), 1);
    assert_eq!(optimized.kernels[&entry_id], DenormalMode::FlushToZero);
}
|
||||
|
||||
// PTX source compiled by the `fold_denormal` test.
static FOLD_DENORMAL_PTX: &str = include_str!("fold_denormal.ptx");
|
||||
|
||||
/// Compiles `FOLD_DENORMAL_PTX` and inspects the last resulting method.
///
/// Expects both function-level flush-to-zero flags to be set and the body to
/// contain exactly one `SetMode`, switching both denormal settings to `false`
/// between the second and the third add.
#[test]
fn fold_denormal() {
    let kernel = compile_methods(FOLD_DENORMAL_PTX).pop().unwrap();
    assert_eq!(true, kernel.flush_to_zero_f32);
    assert_eq!(true, kernel.flush_to_zero_f16f64);

    let statements = kernel.body.unwrap();
    assert!(matches!(
        &statements[..],
        [
            Statement::Label(..),
            Statement::Variable(..),
            Statement::Variable(..),
            Statement::Variable(..),
            Statement::Instruction(ast::Instruction::Add { .. }),
            Statement::Instruction(ast::Instruction::Add { .. }),
            Statement::SetMode(ModeRegister::Denormal {
                f32: false,
                f16f64: false
            }),
            Statement::Instruction(ast::Instruction::Add { .. }),
            Statement::Instruction(ast::Instruction::Add { .. }),
            Statement::Instruction(ast::Instruction::Ret { .. }),
        ]
    ));
}
|
||||
|
||||
fn compile_methods(ptx: &str) -> Vec<Function2<ast::Instruction<SpirvWord>, SpirvWord>> {
|
||||
use crate::pass::*;
|
||||
|
||||
let module = ptx_parser::parse_module_checked(ptx).unwrap();
|
||||
let mut flat_resolver = GlobalStringIdentResolver2::new(SpirvWord(1));
|
||||
let mut scoped_resolver = ScopedResolver::new(&mut flat_resolver);
|
||||
let directives = normalize_identifiers2::run(&mut scoped_resolver, module.directives).unwrap();
|
||||
let directives = normalize_predicates2::run(&mut flat_resolver, directives).unwrap();
|
||||
let directives = expand_operands::run(&mut flat_resolver, directives).unwrap();
|
||||
let directives = normalize_basic_blocks::run(&mut flat_resolver, directives).unwrap();
|
||||
let directives = super::run(&mut flat_resolver, directives).unwrap();
|
||||
directives
|
||||
.into_iter()
|
||||
.filter_map(|s| match s {
|
||||
Directive2::Method(m) => Some(m),
|
||||
_ => None,
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
}
|
||||
|
||||
// PTX source compiled by the `call_with_mode` test.
static CALL_WITH_MODE_PTX: &str = include_str!("call_with_mode.ptx");
|
||||
|
||||
/// Compiles `CALL_WITH_MODE_PTX` and checks the three resulting methods:
///
/// * method 0 has no body (a declaration),
/// * method 1 must have the exact statement shape below, with the call
///   targeting method 0 and every mode prelude branching to the final add,
/// * method 2 must have the exact statement shape below, with its two
///   conditionals and all branches wired to the expected labels.
#[test]
fn call_with_mode() {
    let methods = compile_methods(CALL_WITH_MODE_PTX);

    assert!(matches!(methods[0].body, None));

    let caller = methods[1].body.as_ref().unwrap();
    assert!(matches!(
        &caller[..],
        [
            Statement::Label(..),
            Statement::Variable(..),
            Statement::Instruction(ast::Instruction::Add { .. }),
            Statement::Instruction(ast::Instruction::Call { .. }),
            Statement::Instruction(ast::Instruction::Bra { .. }),
            Statement::Label(..),
            // Dual prelude
            Statement::SetMode(ModeRegister::Denormal {
                f32: true,
                f16f64: true
            }),
            Statement::SetMode(ModeRegister::Rounding {
                f32: ast::RoundingMode::PositiveInf,
                f16f64: ast::RoundingMode::NearestEven
            }),
            Statement::Instruction(ast::Instruction::Bra { .. }),
            // Denormal prelude
            Statement::Label(..),
            Statement::SetMode(ModeRegister::Denormal {
                f32: true,
                f16f64: true
            }),
            Statement::Instruction(ast::Instruction::Bra { .. }),
            // Rounding prelude
            Statement::Label(..),
            Statement::SetMode(ModeRegister::Rounding {
                f32: ast::RoundingMode::PositiveInf,
                f16f64: ast::RoundingMode::NearestEven
            }),
            Statement::Instruction(ast::Instruction::Bra { .. }),
            Statement::Label(..),
            Statement::Instruction(ast::Instruction::Add { .. }),
            Statement::Instruction(ast::Instruction::Ret { .. }),
        ]
    ));
    let [callee] = calls(caller);
    let [_, dual_label, _, _, final_add_label] = labels(caller);
    let [after_call, after_dual, after_denormal, after_rounding] = branches(caller);
    assert_eq!(methods[0].name, callee);
    assert_eq!(after_call, dual_label);
    assert_eq!(after_dual, final_add_label);
    assert_eq!(after_denormal, final_add_label);
    assert_eq!(after_rounding, final_add_label);

    let branching = methods[2].body.as_ref().unwrap();
    assert!(matches!(
        &branching[..],
        [
            Statement::Label(..),
            Statement::Variable(..),
            Statement::Variable(..),
            Statement::Conditional(..),
            Statement::Label(..),
            Statement::Conditional(..),
            Statement::Label(..),
            Statement::Instruction(ast::Instruction::Bra { .. }),
            Statement::Label(..),
            // Dual prelude
            Statement::SetMode(ModeRegister::Denormal {
                f32: false,
                f16f64: true
            }),
            Statement::SetMode(ModeRegister::Rounding {
                f32: ast::RoundingMode::NegativeInf,
                f16f64: ast::RoundingMode::NearestEven
            }),
            Statement::Instruction(ast::Instruction::Bra { .. }),
            // Denormal prelude
            Statement::Label(..),
            Statement::SetMode(ModeRegister::Denormal {
                f32: false,
                f16f64: true
            }),
            Statement::Instruction(ast::Instruction::Bra { .. }),
            // Rounding prelude
            Statement::Label(..),
            Statement::SetMode(ModeRegister::Rounding {
                f32: ast::RoundingMode::NegativeInf,
                f16f64: ast::RoundingMode::NearestEven
            }),
            Statement::Instruction(ast::Instruction::Bra { .. }),
            Statement::Label(..),
            Statement::Instruction(ast::Instruction::Add { .. }),
            Statement::Instruction(ast::Instruction::Bra { .. }),
            Statement::Label(..),
            Statement::SetMode(ModeRegister::Denormal {
                f32: false,
                f16f64: true
            }),
            Statement::Instruction(ast::Instruction::Bra { .. }),
            Statement::Label(..),
            Statement::Instruction(ast::Instruction::Add { .. }),
            Statement::Instruction(ast::Instruction::Bra { .. }),
            Statement::Label(..),
            Statement::Instruction(ast::Instruction::Ret { .. }),
        ]
    ));
    let [(rm_taken, rm_not_taken), (rz_taken, rz_not_taken)] = conditionals(branching);
    let [
        _,
        second_conditional_label,
        after_second_conditional_label,
        dual_prelude_label,
        _,
        _,
        first_add_label,
        second_add_denormal_label,
        second_add_label,
        ret_label,
    ] = labels(branching);
    let [
        jump_after_second_conditional,
        jump_after_dual,
        jump_after_denormal,
        jump_after_rounding,
        jump_after_first_add,
        jump_after_second_add_denormal,
        jump_after_second_add,
    ] = branches(branching);
    assert_eq!(rm_taken, dual_prelude_label);
    assert_eq!(rm_not_taken, second_conditional_label);
    assert_eq!(rz_taken, after_second_conditional_label);
    assert_eq!(rz_not_taken, second_add_denormal_label);
    assert_eq!(jump_after_second_conditional, dual_prelude_label);
    assert_eq!(jump_after_dual, first_add_label);
    assert_eq!(jump_after_denormal, first_add_label);
    assert_eq!(jump_after_rounding, first_add_label);
    assert_eq!(jump_after_first_add, ret_label);
    assert_eq!(jump_after_second_add_denormal, second_add_label);
    assert_eq!(jump_after_second_add, ret_label);
}
|
||||
|
||||
fn branches<const N: usize>(
|
||||
fn_: &Vec<Statement<ast::Instruction<SpirvWord>, SpirvWord>>,
|
||||
) -> [SpirvWord; N] {
|
||||
fn_.iter()
|
||||
.filter_map(|s| match s {
|
||||
Statement::Instruction(ast::Instruction::Bra {
|
||||
arguments: ast::BraArgs { src },
|
||||
}) => Some(*src),
|
||||
_ => None,
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
.try_into()
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
fn labels<const N: usize>(
|
||||
fn_: &Vec<Statement<ast::Instruction<SpirvWord>, SpirvWord>>,
|
||||
) -> [SpirvWord; N] {
|
||||
fn_.iter()
|
||||
.filter_map(
|
||||
|s: &Statement<ptx_parser::Instruction<SpirvWord>, SpirvWord>| match s {
|
||||
Statement::Label(label) => Some(*label),
|
||||
_ => None,
|
||||
},
|
||||
)
|
||||
.collect::<Vec<_>>()
|
||||
.try_into()
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
fn calls<const N: usize>(
|
||||
fn_: &Vec<Statement<ast::Instruction<SpirvWord>, SpirvWord>>,
|
||||
) -> [SpirvWord; N] {
|
||||
fn_.iter()
|
||||
.filter_map(|s| match s {
|
||||
Statement::Instruction(ast::Instruction::Call {
|
||||
arguments: ast::CallArgs { func, .. },
|
||||
..
|
||||
}) => Some(*func),
|
||||
_ => None,
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
.try_into()
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
fn conditionals<const N: usize>(
|
||||
fn_: &Vec<Statement<ast::Instruction<SpirvWord>, SpirvWord>>,
|
||||
) -> [(SpirvWord, SpirvWord); N] {
|
||||
fn_.iter()
|
||||
.filter_map(|s| match s {
|
||||
Statement::Conditional(BrachCondition {
|
||||
if_true, if_false, ..
|
||||
}) => Some((*if_true, *if_false)),
|
||||
_ => None,
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
.try_into()
|
||||
.unwrap()
|
||||
}
|
@ -17,12 +17,15 @@ mod expand_operands;
|
||||
mod fix_special_registers2;
|
||||
mod hoist_globals;
|
||||
mod insert_explicit_load_store;
|
||||
mod instruction_mode_to_global_mode;
|
||||
mod insert_implicit_conversions2;
|
||||
mod normalize_basic_blocks;
|
||||
mod normalize_identifiers2;
|
||||
mod normalize_predicates2;
|
||||
mod remove_unreachable_basic_blocks;
|
||||
mod replace_instructions_with_function_calls;
|
||||
mod resolve_function_pointers;
|
||||
mod replace_known_functions;
|
||||
mod resolve_function_pointers;
|
||||
|
||||
static ZLUDA_PTX_IMPL: &'static [u8] = include_bytes!("../../lib/zluda_ptx_impl.bc");
|
||||
const ZLUDA_PTX_PREFIX: &'static str = "__zluda_ptx_impl_";
|
||||
@ -43,12 +46,15 @@ pub fn to_llvm_module<'input>(ast: ast::Module<'input>) -> Result<Module, Transl
|
||||
let mut scoped_resolver = ScopedResolver::new(&mut flat_resolver);
|
||||
let sreg_map = SpecialRegistersMap2::new(&mut scoped_resolver)?;
|
||||
let directives = normalize_identifiers2::run(&mut scoped_resolver, ast.directives)?;
|
||||
let directives = replace_known_functions::run(&flat_resolver, directives);
|
||||
let directives = replace_known_functions::run(&mut flat_resolver, directives);
|
||||
let directives = normalize_predicates2::run(&mut flat_resolver, directives)?;
|
||||
let directives = resolve_function_pointers::run(directives)?;
|
||||
let directives: Vec<Directive2<'_, ptx_parser::Instruction<ptx_parser::ParsedOperand<SpirvWord>>, ptx_parser::ParsedOperand<SpirvWord>>> = fix_special_registers2::run(&mut flat_resolver, &sreg_map, directives)?;
|
||||
let directives = fix_special_registers2::run(&mut flat_resolver, &sreg_map, directives)?;
|
||||
let directives = expand_operands::run(&mut flat_resolver, directives)?;
|
||||
let directives = deparamize_functions::run(&mut flat_resolver, directives)?;
|
||||
let directives = normalize_basic_blocks::run(&mut flat_resolver, directives)?;
|
||||
let directives = remove_unreachable_basic_blocks::run(directives)?;
|
||||
let directives = instruction_mode_to_global_mode::run(&mut flat_resolver, directives)?;
|
||||
let directives = insert_explicit_load_store::run(&mut flat_resolver, directives)?;
|
||||
let directives = insert_implicit_conversions2::run(&mut flat_resolver, directives)?;
|
||||
let directives = replace_instructions_with_function_calls::run(&mut flat_resolver, directives)?;
|
||||
@ -195,6 +201,20 @@ enum Statement<I, P: ast::Operand> {
|
||||
FunctionPointer(FunctionPointerDetails),
|
||||
VectorRead(VectorRead),
|
||||
VectorWrite(VectorWrite),
|
||||
SetMode(ModeRegister),
|
||||
}
|
||||
|
||||
/// A floating-point mode setting, carried by `Statement::SetMode`.
///
/// Each variant holds one setting for f32 operations and a separate one shared
/// by f16 and f64 operations. `Debug` is only derived in test builds.
#[derive(Eq, PartialEq, Clone, Copy)]
|
||||
#[cfg_attr(test, derive(Debug))]
|
||||
enum ModeRegister {
|
||||
/// Denormal handling. The LLVM emitter treats `true` as flush-to-zero and
/// `false` as keeping denormals.
Denormal {
|
||||
f32: bool,
|
||||
f16f64: bool,
|
||||
},
|
||||
/// Rounding mode.
Rounding {
|
||||
f32: ast::RoundingMode,
|
||||
f16f64: ast::RoundingMode,
|
||||
},
|
||||
}
|
||||
|
||||
impl<T: ast::Operand<Ident = SpirvWord>> Statement<ast::Instruction<T>, T> {
|
||||
@ -467,6 +487,7 @@ impl<T: ast::Operand<Ident = SpirvWord>> Statement<ast::Instruction<T>, T> {
|
||||
let src = visitor.visit_ident(src, None, false, false)?;
|
||||
Statement::FunctionPointer(FunctionPointerDetails { dst, src })
|
||||
}
|
||||
Statement::SetMode(mode_register) => Statement::SetMode(mode_register),
|
||||
})
|
||||
}
|
||||
}
|
||||
@ -525,7 +546,7 @@ struct FunctionPointerDetails {
|
||||
src: SpirvWord,
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
|
||||
#[derive(Copy, Clone, PartialEq, Eq, Hash, PartialOrd, Ord, Debug)]
|
||||
pub struct SpirvWord(u32);
|
||||
|
||||
impl From<u32> for SpirvWord {
|
||||
@ -557,22 +578,27 @@ type NormalizedStatement = Statement<
|
||||
ast::ParsedOperand<SpirvWord>,
|
||||
>;
|
||||
|
||||
enum Directive2<'input, Instruction, Operand: ast::Operand> {
|
||||
enum Directive2<Instruction, Operand: ast::Operand> {
|
||||
Variable(ast::LinkingDirective, ast::Variable<SpirvWord>),
|
||||
Method(Function2<'input, Instruction, Operand>),
|
||||
Method(Function2<Instruction, Operand>),
|
||||
}
|
||||
|
||||
struct Function2<'input, Instruction, Operand: ast::Operand> {
|
||||
pub func_decl: ast::MethodDeclaration<'input, SpirvWord>,
|
||||
pub globals: Vec<ast::Variable<SpirvWord>>,
|
||||
struct Function2<Instruction, Operand: ast::Operand> {
|
||||
pub return_arguments: Vec<ast::Variable<Operand::Ident>>,
|
||||
pub name: Operand::Ident,
|
||||
pub input_arguments: Vec<ast::Variable<Operand::Ident>>,
|
||||
pub body: Option<Vec<Statement<Instruction, Operand>>>,
|
||||
is_kernel: bool,
|
||||
import_as: Option<String>,
|
||||
tuning: Vec<ast::TuningDirective>,
|
||||
linkage: ast::LinkingDirective,
|
||||
flush_to_zero_f32: bool,
|
||||
flush_to_zero_f16f64: bool,
|
||||
rounding_mode_f32: ast::RoundingMode,
|
||||
rounding_mode_f16f64: ast::RoundingMode,
|
||||
}
|
||||
|
||||
type NormalizedDirective2<'input> = Directive2<
|
||||
'input,
|
||||
type NormalizedDirective2 = Directive2<
|
||||
(
|
||||
Option<ast::PredAt<SpirvWord>>,
|
||||
ast::Instruction<ast::ParsedOperand<SpirvWord>>,
|
||||
@ -580,8 +606,7 @@ type NormalizedDirective2<'input> = Directive2<
|
||||
ast::ParsedOperand<SpirvWord>,
|
||||
>;
|
||||
|
||||
type NormalizedFunction2<'input> = Function2<
|
||||
'input,
|
||||
type NormalizedFunction2 = Function2<
|
||||
(
|
||||
Option<ast::PredAt<SpirvWord>>,
|
||||
ast::Instruction<ast::ParsedOperand<SpirvWord>>,
|
||||
@ -589,17 +614,11 @@ type NormalizedFunction2<'input> = Function2<
|
||||
ast::ParsedOperand<SpirvWord>,
|
||||
>;
|
||||
|
||||
type UnconditionalDirective<'input> = Directive2<
|
||||
'input,
|
||||
ast::Instruction<ast::ParsedOperand<SpirvWord>>,
|
||||
ast::ParsedOperand<SpirvWord>,
|
||||
>;
|
||||
type UnconditionalDirective =
|
||||
Directive2<ast::Instruction<ast::ParsedOperand<SpirvWord>>, ast::ParsedOperand<SpirvWord>>;
|
||||
|
||||
type UnconditionalFunction<'input> = Function2<
|
||||
'input,
|
||||
ast::Instruction<ast::ParsedOperand<SpirvWord>>,
|
||||
ast::ParsedOperand<SpirvWord>,
|
||||
>;
|
||||
type UnconditionalFunction =
|
||||
Function2<ast::Instruction<ast::ParsedOperand<SpirvWord>>, ast::ParsedOperand<SpirvWord>>;
|
||||
|
||||
struct GlobalStringIdentResolver2<'input> {
|
||||
pub(crate) current_id: SpirvWord,
|
||||
@ -805,47 +824,45 @@ impl SpecialRegistersMap2 {
|
||||
self.id_to_reg.get(&id).copied()
|
||||
}
|
||||
|
||||
fn generate_declarations<'a, 'input>(
|
||||
fn len() -> usize {
|
||||
PtxSpecialRegister::iter().len()
|
||||
}
|
||||
|
||||
fn foreach_declaration<'a, 'input>(
|
||||
resolver: &'a mut GlobalStringIdentResolver2<'input>,
|
||||
) -> impl ExactSizeIterator<
|
||||
Item = (
|
||||
mut fn_: impl FnMut(
|
||||
PtxSpecialRegister,
|
||||
ast::MethodDeclaration<'input, SpirvWord>,
|
||||
(
|
||||
Vec<ast::Variable<SpirvWord>>,
|
||||
SpirvWord,
|
||||
Vec<ast::Variable<SpirvWord>>,
|
||||
),
|
||||
),
|
||||
> + 'a {
|
||||
PtxSpecialRegister::iter().map(|sreg| {
|
||||
) {
|
||||
for sreg in PtxSpecialRegister::iter() {
|
||||
let external_fn_name = [ZLUDA_PTX_PREFIX, sreg.get_unprefixed_function_name()].concat();
|
||||
let name =
|
||||
ast::MethodName::Func(resolver.register_named(Cow::Owned(external_fn_name), None));
|
||||
let name = resolver.register_named(Cow::Owned(external_fn_name), None);
|
||||
let return_type = sreg.get_function_return_type();
|
||||
let input_type = sreg.get_function_input_type();
|
||||
(
|
||||
sreg,
|
||||
ast::MethodDeclaration {
|
||||
return_arguments: vec![ast::Variable {
|
||||
align: None,
|
||||
v_type: return_type.into(),
|
||||
state_space: ast::StateSpace::Reg,
|
||||
name: resolver
|
||||
.register_unnamed(Some((return_type.into(), ast::StateSpace::Reg))),
|
||||
array_init: Vec::new(),
|
||||
}],
|
||||
name: name,
|
||||
input_arguments: input_type
|
||||
.into_iter()
|
||||
.map(|type_| ast::Variable {
|
||||
align: None,
|
||||
v_type: type_.into(),
|
||||
state_space: ast::StateSpace::Reg,
|
||||
name: resolver
|
||||
.register_unnamed(Some((type_.into(), ast::StateSpace::Reg))),
|
||||
array_init: Vec::new(),
|
||||
})
|
||||
.collect::<Vec<_>>(),
|
||||
shared_mem: None,
|
||||
},
|
||||
)
|
||||
})
|
||||
let return_arguments = vec![ast::Variable {
|
||||
align: None,
|
||||
v_type: return_type.into(),
|
||||
state_space: ast::StateSpace::Reg,
|
||||
name: resolver.register_unnamed(Some((return_type.into(), ast::StateSpace::Reg))),
|
||||
array_init: Vec::new(),
|
||||
}];
|
||||
let input_arguments = input_type
|
||||
.into_iter()
|
||||
.map(|type_| ast::Variable {
|
||||
align: None,
|
||||
v_type: type_.into(),
|
||||
state_space: ast::StateSpace::Reg,
|
||||
name: resolver.register_unnamed(Some((type_.into(), ast::StateSpace::Reg))),
|
||||
array_init: Vec::new(),
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
fn_(sreg, (return_arguments, name, input_arguments));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
134
ptx/src/pass/normalize_basic_blocks.rs
Normal file
134
ptx/src/pass/normalize_basic_blocks.rs
Normal file
@ -0,0 +1,134 @@
|
||||
use super::*;
|
||||
|
||||
// This pass normalizes ptx modules in several ways that make the mode computation pass
|
||||
// and code emission passes much simpler:
|
||||
// * Inserts label at the start of every function
|
||||
// This makes control flow graph simpler in mode computation block: we can
|
||||
// represent kernels as separate nodes with its own separate entry/exit mode
|
||||
// * Inserts label at the start of every basic block
|
||||
// * Insert explicit jumps before labels
|
||||
// * Non-.entry methods get a single `ret;` exit point - this is because mode computation
|
||||
// logic requires it. Control flow graph constructed by mode computation
|
||||
// models function calls as jumps into and then from another function.
|
||||
// If this cfg allowed multiple return basic blocks then there would be cases
|
||||
// where we want to insert mode setting instruction along the edge between
|
||||
// `ret;` and bb in the caller. This is only possible if there's a single
|
||||
// edge from the function's `ret;` to the caller
|
||||
pub(crate) fn run(
|
||||
flat_resolver: &mut GlobalStringIdentResolver2<'_>,
|
||||
mut directives: Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>,
|
||||
) -> Result<Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
|
||||
for directive in directives.iter_mut() {
|
||||
let (body_ref, is_kernel) = match directive {
|
||||
Directive2::Method(Function2 {
|
||||
body: Some(body), is_kernel, ..
|
||||
}) => (body, *is_kernel),
|
||||
_ => continue,
|
||||
};
|
||||
let body = std::mem::replace(body_ref, Vec::new());
|
||||
let mut result = Vec::with_capacity(body.len());
|
||||
let mut previous_instruction_was_terminator = TerminatorKind::Not;
|
||||
let mut body_iterator = body.into_iter();
|
||||
let mut return_statements = Vec::new();
|
||||
match body_iterator.next() {
|
||||
Some(Statement::Label(_)) => {}
|
||||
Some(statement) => {
|
||||
result.push(Statement::Label(flat_resolver.register_unnamed(None)));
|
||||
result.push(statement);
|
||||
}
|
||||
None => {}
|
||||
}
|
||||
for statement in body_iterator {
|
||||
match previous_instruction_was_terminator {
|
||||
TerminatorKind::Not => match statement {
|
||||
Statement::Label(label) => {
|
||||
result.push(Statement::Instruction(ast::Instruction::Bra {
|
||||
arguments: ast::BraArgs { src: label },
|
||||
}))
|
||||
}
|
||||
_ => {}
|
||||
},
|
||||
TerminatorKind::Real => {
|
||||
if !matches!(statement, Statement::Label(..)) {
|
||||
result.push(Statement::Label(flat_resolver.register_unnamed(None)));
|
||||
}
|
||||
}
|
||||
TerminatorKind::Fake => match statement {
|
||||
// If there's a label after a call just reuse it
|
||||
Statement::Label(label) => {
|
||||
result.push(Statement::Instruction(ast::Instruction::Bra {
|
||||
arguments: ast::BraArgs { src: label },
|
||||
}))
|
||||
}
|
||||
_ => {
|
||||
let label = flat_resolver.register_unnamed(None);
|
||||
result.push(Statement::Instruction(ast::Instruction::Bra {
|
||||
arguments: ast::BraArgs { src: label },
|
||||
}));
|
||||
result.push(Statement::Label(label));
|
||||
}
|
||||
},
|
||||
}
|
||||
match statement {
|
||||
Statement::RetValue(..) => {
|
||||
return Err(error_unreachable());
|
||||
}
|
||||
Statement::Instruction(ast::Instruction::Ret { .. }) => {
|
||||
if !is_kernel {
|
||||
return_statements.push(result.len());
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
previous_instruction_was_terminator = is_block_terminator(&statement);
|
||||
result.push(statement);
|
||||
}
|
||||
convert_from_multiple_returns_to_single_return(
|
||||
flat_resolver,
|
||||
&mut result,
|
||||
return_statements,
|
||||
)?;
|
||||
*body_ref = result;
|
||||
}
|
||||
Ok(directives)
|
||||
}
|
||||
|
||||
/// Classification of a statement with respect to basic block boundaries,
/// as computed by `is_block_terminator`.
enum TerminatorKind {
    /// The statement does not end a basic block.
    Not,
    /// The statement ends a basic block: a conditional, `bra` or `ret`.
    Real,
    /// A `call`: not a terminator in the usual sense, but treated as one by
    /// this pass.
    Fake,
}
|
||||
|
||||
fn convert_from_multiple_returns_to_single_return(
|
||||
flat_resolver: &mut GlobalStringIdentResolver2<'_>,
|
||||
result: &mut Vec<Statement<ptx_parser::Instruction<SpirvWord>, SpirvWord>>,
|
||||
return_statements: Vec<usize>,
|
||||
) -> Result<(), TranslateError> {
|
||||
Ok(if return_statements.len() > 1 {
|
||||
let ret_bb = flat_resolver.register_unnamed(None);
|
||||
result.push(Statement::Label(ret_bb));
|
||||
result.push(Statement::Instruction(ast::Instruction::Ret {
|
||||
data: ast::RetData { uniform: false },
|
||||
}));
|
||||
for ret_index in return_statements {
|
||||
let statement = result.get_mut(ret_index).ok_or_else(error_unreachable)?;
|
||||
*statement = Statement::Instruction(ast::Instruction::Bra {
|
||||
arguments: ast::BraArgs { src: ret_bb },
|
||||
});
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
fn is_block_terminator(
|
||||
statement: &Statement<ast::Instruction<SpirvWord>, SpirvWord>,
|
||||
) -> TerminatorKind {
|
||||
match statement {
|
||||
Statement::Conditional(..)
|
||||
| Statement::Instruction(ast::Instruction::Bra { .. })
|
||||
// Normally call is not a terminator, but we treat it as such because it
|
||||
// makes the "instruction modes to global modes" pass possible
|
||||
| Statement::Instruction(ast::Instruction::Ret { .. }) => TerminatorKind::Real,
|
||||
Statement::Instruction(ast::Instruction::Call { .. }) => TerminatorKind::Fake,
|
||||
_ => TerminatorKind::Not,
|
||||
}
|
||||
}
|
@ -4,7 +4,7 @@ use ptx_parser as ast;
|
||||
pub(crate) fn run<'input, 'b>(
|
||||
resolver: &mut ScopedResolver<'input, 'b>,
|
||||
directives: Vec<ast::Directive<'input, ast::ParsedOperand<&'input str>>>,
|
||||
) -> Result<Vec<NormalizedDirective2<'input>>, TranslateError> {
|
||||
) -> Result<Vec<NormalizedDirective2>, TranslateError> {
|
||||
resolver.start_scope();
|
||||
let result = directives
|
||||
.into_iter()
|
||||
@ -17,7 +17,7 @@ pub(crate) fn run<'input, 'b>(
|
||||
fn run_directive<'input, 'b>(
|
||||
resolver: &mut ScopedResolver<'input, 'b>,
|
||||
directive: ast::Directive<'input, ast::ParsedOperand<&'input str>>,
|
||||
) -> Result<NormalizedDirective2<'input>, TranslateError> {
|
||||
) -> Result<NormalizedDirective2, TranslateError> {
|
||||
Ok(match directive {
|
||||
ast::Directive::Variable(linking, var) => {
|
||||
NormalizedDirective2::Variable(linking, run_variable(resolver, var)?)
|
||||
@ -32,15 +32,11 @@ fn run_method<'input, 'b>(
|
||||
resolver: &mut ScopedResolver<'input, 'b>,
|
||||
linkage: ast::LinkingDirective,
|
||||
method: ast::Function<'input, &'input str, ast::Statement<ast::ParsedOperand<&'input str>>>,
|
||||
) -> Result<NormalizedFunction2<'input>, TranslateError> {
|
||||
let name = match method.func_directive.name {
|
||||
ast::MethodName::Kernel(name) => ast::MethodName::Kernel(name),
|
||||
ast::MethodName::Func(text) => {
|
||||
ast::MethodName::Func(resolver.add_or_get_in_current_scope_untyped(text)?)
|
||||
}
|
||||
};
|
||||
) -> Result<NormalizedFunction2, TranslateError> {
|
||||
let is_kernel = method.func_directive.name.is_kernel();
|
||||
let name = resolver.add_or_get_in_current_scope_untyped(method.func_directive.name.text())?;
|
||||
resolver.start_scope();
|
||||
let func_decl = run_function_decl(resolver, method.func_directive, name)?;
|
||||
let (return_arguments, input_arguments) = run_function_decl(resolver, method.func_directive)?;
|
||||
let body = method
|
||||
.body
|
||||
.map(|statements| {
|
||||
@ -51,20 +47,25 @@ fn run_method<'input, 'b>(
|
||||
.transpose()?;
|
||||
resolver.end_scope();
|
||||
Ok(Function2 {
|
||||
func_decl,
|
||||
globals: Vec::new(),
|
||||
return_arguments,
|
||||
name,
|
||||
input_arguments,
|
||||
body,
|
||||
import_as: None,
|
||||
tuning: method.tuning,
|
||||
linkage,
|
||||
is_kernel,
|
||||
tuning: method.tuning,
|
||||
flush_to_zero_f32: false,
|
||||
flush_to_zero_f16f64: false,
|
||||
rounding_mode_f32: ptx_parser::RoundingMode::NearestEven,
|
||||
rounding_mode_f16f64: ptx_parser::RoundingMode::NearestEven,
|
||||
})
|
||||
}
|
||||
|
||||
fn run_function_decl<'input, 'b>(
|
||||
resolver: &mut ScopedResolver<'input, 'b>,
|
||||
func_directive: ast::MethodDeclaration<'input, &'input str>,
|
||||
name: ast::MethodName<'input, SpirvWord>,
|
||||
) -> Result<ast::MethodDeclaration<'input, SpirvWord>, TranslateError> {
|
||||
) -> Result<(Vec<ast::Variable<SpirvWord>>, Vec<ast::Variable<SpirvWord>>), TranslateError> {
|
||||
assert!(func_directive.shared_mem.is_none());
|
||||
let return_arguments = func_directive
|
||||
.return_arguments
|
||||
@ -76,12 +77,7 @@ fn run_function_decl<'input, 'b>(
|
||||
.into_iter()
|
||||
.map(|var| run_variable(resolver, var))
|
||||
.collect::<Result<Vec<_>, _>>()?;
|
||||
Ok(ast::MethodDeclaration {
|
||||
return_arguments,
|
||||
name,
|
||||
input_arguments,
|
||||
shared_mem: None,
|
||||
})
|
||||
Ok((return_arguments, input_arguments))
|
||||
}
|
||||
|
||||
fn run_variable<'input, 'b>(
|
||||
|
@ -3,8 +3,8 @@ use ptx_parser as ast;
|
||||
|
||||
pub(crate) fn run<'input>(
|
||||
resolver: &mut GlobalStringIdentResolver2<'input>,
|
||||
directives: Vec<NormalizedDirective2<'input>>,
|
||||
) -> Result<Vec<UnconditionalDirective<'input>>, TranslateError> {
|
||||
directives: Vec<NormalizedDirective2>,
|
||||
) -> Result<Vec<UnconditionalDirective>, TranslateError> {
|
||||
directives
|
||||
.into_iter()
|
||||
.map(|directive| run_directive(resolver, directive))
|
||||
@ -13,8 +13,8 @@ pub(crate) fn run<'input>(
|
||||
|
||||
fn run_directive<'input>(
|
||||
resolver: &mut GlobalStringIdentResolver2<'input>,
|
||||
directive: NormalizedDirective2<'input>,
|
||||
) -> Result<UnconditionalDirective<'input>, TranslateError> {
|
||||
directive: NormalizedDirective2,
|
||||
) -> Result<UnconditionalDirective, TranslateError> {
|
||||
Ok(match directive {
|
||||
Directive2::Variable(linking, var) => Directive2::Variable(linking, var),
|
||||
Directive2::Method(method) => Directive2::Method(run_method(resolver, method)?),
|
||||
@ -23,8 +23,8 @@ fn run_directive<'input>(
|
||||
|
||||
fn run_method<'input>(
|
||||
resolver: &mut GlobalStringIdentResolver2<'input>,
|
||||
method: NormalizedFunction2<'input>,
|
||||
) -> Result<UnconditionalFunction<'input>, TranslateError> {
|
||||
method: NormalizedFunction2,
|
||||
) -> Result<UnconditionalFunction, TranslateError> {
|
||||
let body = method
|
||||
.body
|
||||
.map(|statements| {
|
||||
@ -36,12 +36,18 @@ fn run_method<'input>(
|
||||
})
|
||||
.transpose()?;
|
||||
Ok(Function2 {
|
||||
func_decl: method.func_decl,
|
||||
globals: method.globals,
|
||||
body,
|
||||
return_arguments: method.return_arguments,
|
||||
name: method.name,
|
||||
input_arguments: method.input_arguments,
|
||||
import_as: method.import_as,
|
||||
tuning: method.tuning,
|
||||
linkage: method.linkage,
|
||||
is_kernel: method.is_kernel,
|
||||
flush_to_zero_f32: method.flush_to_zero_f32,
|
||||
flush_to_zero_f16f64: method.flush_to_zero_f16f64,
|
||||
rounding_mode_f32: method.rounding_mode_f32,
|
||||
rounding_mode_f16f64: method.rounding_mode_f16f64,
|
||||
})
|
||||
}
|
||||
|
||||
|
122
ptx/src/pass/remove_unreachable_basic_blocks.rs
Normal file
122
ptx/src/pass/remove_unreachable_basic_blocks.rs
Normal file
@ -0,0 +1,122 @@
|
||||
use super::*;
|
||||
use petgraph::{
|
||||
graph::NodeIndex,
|
||||
visit::{Bfs, VisitMap},
|
||||
Graph,
|
||||
};
|
||||
use rustc_hash::FxHashSet;
|
||||
|
||||
pub(crate) fn run(
|
||||
mut directives: Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>,
|
||||
) -> Result<Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
|
||||
let mut reachable_funcs = FxHashSet::default();
|
||||
for directive in directives.iter_mut() {
|
||||
match directive {
|
||||
Directive2::Method(Function2 {
|
||||
body: Some(body), ..
|
||||
}) => {
|
||||
let old_body = std::mem::replace(body, Vec::new());
|
||||
let mut cfg = ControlFlowGraph::new();
|
||||
let mut old_body_iter = old_body.iter();
|
||||
let mut current_bb = match old_body_iter.next() {
|
||||
Some(Statement::Label(label)) => cfg.add_or_get_node(*label),
|
||||
_ => return Err(error_unreachable()),
|
||||
};
|
||||
let first_bb = current_bb;
|
||||
for statement in old_body_iter {
|
||||
match statement {
|
||||
Statement::Label(label) => {
|
||||
current_bb = cfg.add_or_get_node(*label);
|
||||
}
|
||||
Statement::Conditional(branch) => {
|
||||
cfg.add_branch(current_bb, branch.if_true);
|
||||
cfg.add_branch(current_bb, branch.if_false);
|
||||
}
|
||||
Statement::Instruction(ast::Instruction::Bra {
|
||||
arguments: ast::BraArgs { src },
|
||||
}) => {
|
||||
cfg.add_branch(current_bb, *src);
|
||||
}
|
||||
Statement::FunctionPointer(FunctionPointerDetails {
|
||||
src: _func, ..
|
||||
}) => {
|
||||
return Err(error_todo());
|
||||
}
|
||||
Statement::Instruction(ast::Instruction::Call {
|
||||
arguments: ast::CallArgs { func, .. },
|
||||
..
|
||||
}) => {
|
||||
reachable_funcs.insert(*func);
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
let mut bfs = Bfs::new(&cfg.graph, first_bb);
|
||||
while let Some(_) = bfs.next(&cfg.graph) {}
|
||||
let mut visited = true;
|
||||
*body = try_filter_to_vec(old_body.into_iter(), |statement| {
|
||||
match statement {
|
||||
Statement::Label(label) => {
|
||||
visited = bfs
|
||||
.discovered
|
||||
.is_visited(cfg.nodes.get(label).ok_or_else(error_unreachable)?);
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
Ok(visited)
|
||||
})?;
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
Ok(directives
|
||||
.into_iter()
|
||||
.filter(|directive| match directive {
|
||||
Directive2::Variable(..) => true,
|
||||
Directive2::Method(Function2 {
|
||||
name, is_kernel, ..
|
||||
}) => *is_kernel || reachable_funcs.contains(name),
|
||||
})
|
||||
.collect::<Vec<_>>())
|
||||
}
|
||||
|
||||
/// Collects into a `Vec` the items of `iter` for which `filter` returns
/// `Ok(true)`, preserving their order.
///
/// Stops at the first `Err` returned by `filter` and returns that error.
/// The vector is allocated up front with capacity `iter.len()`.
fn try_filter_to_vec<T, E>(
    iter: impl ExactSizeIterator<Item = T>,
    mut filter: impl FnMut(&T) -> Result<bool, E>,
) -> Result<Vec<T>, E> {
    let mut kept = Vec::with_capacity(iter.len());
    for item in iter {
        if filter(&item)? {
            kept.push(item);
        }
    }
    Ok(kept)
}
|
||||
|
||||
struct ControlFlowGraph {
|
||||
graph: Graph<SpirvWord, ()>,
|
||||
nodes: FxHashMap<SpirvWord, NodeIndex>,
|
||||
}
|
||||
|
||||
impl ControlFlowGraph {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
graph: Graph::new(),
|
||||
nodes: FxHashMap::default(),
|
||||
}
|
||||
}
|
||||
|
||||
fn add_or_get_node(&mut self, id: SpirvWord) -> NodeIndex {
|
||||
*self
|
||||
.nodes
|
||||
.entry(id)
|
||||
.or_insert_with(|| self.graph.add_node(id))
|
||||
}
|
||||
|
||||
fn add_branch(&mut self, from: NodeIndex, to: SpirvWord) -> NodeIndex {
|
||||
let to = self.add_or_get_node(to);
|
||||
self.graph.add_edge(from, to, ());
|
||||
to
|
||||
}
|
||||
}
|
@ -2,8 +2,8 @@ use super::*;
|
||||
|
||||
pub(super) fn run<'input>(
|
||||
resolver: &mut GlobalStringIdentResolver2<'input>,
|
||||
directives: Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>,
|
||||
) -> Result<Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
|
||||
directives: Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>,
|
||||
) -> Result<Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
|
||||
let mut fn_declarations = FxHashMap::default();
|
||||
let remapped_directives = directives
|
||||
.into_iter()
|
||||
@ -13,17 +13,18 @@ pub(super) fn run<'input>(
|
||||
.into_iter()
|
||||
.map(|(_, (return_arguments, name, input_arguments))| {
|
||||
Directive2::Method(Function2 {
|
||||
func_decl: ast::MethodDeclaration {
|
||||
return_arguments,
|
||||
name: ast::MethodName::Func(name),
|
||||
input_arguments,
|
||||
shared_mem: None,
|
||||
},
|
||||
globals: Vec::new(),
|
||||
return_arguments,
|
||||
name: name,
|
||||
input_arguments,
|
||||
body: None,
|
||||
import_as: None,
|
||||
tuning: Vec::new(),
|
||||
linkage: ast::LinkingDirective::EXTERN,
|
||||
is_kernel: false,
|
||||
flush_to_zero_f32: false,
|
||||
flush_to_zero_f16f64: false,
|
||||
rounding_mode_f32: ptx_parser::RoundingMode::NearestEven,
|
||||
rounding_mode_f16f64: ptx_parser::RoundingMode::NearestEven,
|
||||
})
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
@ -41,8 +42,8 @@ fn run_directive<'input>(
|
||||
Vec<ast::Variable<SpirvWord>>,
|
||||
),
|
||||
>,
|
||||
directive: Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>,
|
||||
) -> Result<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
|
||||
directive: Directive2<ast::Instruction<SpirvWord>, SpirvWord>,
|
||||
) -> Result<Directive2<ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
|
||||
Ok(match directive {
|
||||
var @ Directive2::Variable(..) => var,
|
||||
Directive2::Method(mut method) => {
|
||||
|
@ -1,14 +1,15 @@
|
||||
use std::borrow::Cow;
|
||||
|
||||
use super::{GlobalStringIdentResolver2, NormalizedDirective2, SpirvWord};
|
||||
|
||||
pub(crate) fn run<'input>(
|
||||
resolver: &GlobalStringIdentResolver2<'input>,
|
||||
mut directives: Vec<NormalizedDirective2<'input>>,
|
||||
) -> Vec<NormalizedDirective2<'input>> {
|
||||
resolver: &mut GlobalStringIdentResolver2<'input>,
|
||||
mut directives: Vec<NormalizedDirective2>,
|
||||
) -> Vec<NormalizedDirective2> {
|
||||
for directive in directives.iter_mut() {
|
||||
match directive {
|
||||
NormalizedDirective2::Method(func) => {
|
||||
func.import_as =
|
||||
replace_with_ptx_impl(resolver, &func.func_decl.name, func.import_as.take());
|
||||
replace_with_ptx_impl(resolver, func.name);
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
@ -17,22 +18,16 @@ pub(crate) fn run<'input>(
|
||||
}
|
||||
|
||||
fn replace_with_ptx_impl<'input>(
|
||||
resolver: &GlobalStringIdentResolver2<'input>,
|
||||
fn_name: &ptx_parser::MethodName<'input, SpirvWord>,
|
||||
name: Option<String>,
|
||||
) -> Option<String> {
|
||||
resolver: &mut GlobalStringIdentResolver2<'input>,
|
||||
fn_name: SpirvWord,
|
||||
) {
|
||||
let known_names = ["__assertfail"];
|
||||
match name {
|
||||
Some(name) if known_names.contains(&&*name) => Some(format!("__zluda_ptx_impl_{}", name)),
|
||||
Some(name) => Some(name),
|
||||
None => match fn_name {
|
||||
ptx_parser::MethodName::Func(name) => match resolver.ident_map.get(name) {
|
||||
Some(super::IdentEntry {
|
||||
name: Some(name), ..
|
||||
}) => Some(format!("__zluda_ptx_impl_{}", name)),
|
||||
_ => None,
|
||||
},
|
||||
ptx_parser::MethodName::Kernel(..) => None,
|
||||
},
|
||||
if let Some(super::IdentEntry {
|
||||
name: Some(name), ..
|
||||
}) = resolver.ident_map.get_mut(&fn_name)
|
||||
{
|
||||
if known_names.contains(&&**name) {
|
||||
*name = Cow::Owned(format!("__zluda_ptx_impl_{}", name));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -3,8 +3,8 @@ use ptx_parser as ast;
|
||||
use rustc_hash::FxHashSet;
|
||||
|
||||
pub(crate) fn run<'input>(
|
||||
directives: Vec<UnconditionalDirective<'input>>,
|
||||
) -> Result<Vec<UnconditionalDirective<'input>>, TranslateError> {
|
||||
directives: Vec<UnconditionalDirective>,
|
||||
) -> Result<Vec<UnconditionalDirective>, TranslateError> {
|
||||
let mut functions = FxHashSet::default();
|
||||
directives
|
||||
.into_iter()
|
||||
@ -14,19 +14,13 @@ pub(crate) fn run<'input>(
|
||||
|
||||
fn run_directive<'input>(
|
||||
functions: &mut FxHashSet<SpirvWord>,
|
||||
directive: UnconditionalDirective<'input>,
|
||||
) -> Result<UnconditionalDirective<'input>, TranslateError> {
|
||||
directive: UnconditionalDirective,
|
||||
) -> Result<UnconditionalDirective, TranslateError> {
|
||||
Ok(match directive {
|
||||
var @ Directive2::Variable(..) => var,
|
||||
Directive2::Method(method) => {
|
||||
{
|
||||
let func_decl = &method.func_decl;
|
||||
match func_decl.name {
|
||||
ptx_parser::MethodName::Kernel(_) => {}
|
||||
ptx_parser::MethodName::Func(name) => {
|
||||
functions.insert(name);
|
||||
}
|
||||
}
|
||||
if !method.is_kernel {
|
||||
functions.insert(method.name);
|
||||
}
|
||||
Directive2::Method(run_method(functions, method)?)
|
||||
}
|
||||
@ -35,8 +29,8 @@ fn run_directive<'input>(
|
||||
|
||||
fn run_method<'input>(
|
||||
functions: &mut FxHashSet<SpirvWord>,
|
||||
method: UnconditionalFunction<'input>,
|
||||
) -> Result<UnconditionalFunction<'input>, TranslateError> {
|
||||
method: UnconditionalFunction,
|
||||
) -> Result<UnconditionalFunction, TranslateError> {
|
||||
let body = method
|
||||
.body
|
||||
.map(|statements| {
|
||||
@ -46,14 +40,7 @@ fn run_method<'input>(
|
||||
.collect::<Result<Vec<_>, _>>()
|
||||
})
|
||||
.transpose()?;
|
||||
Ok(Function2 {
|
||||
func_decl: method.func_decl,
|
||||
globals: method.globals,
|
||||
body,
|
||||
import_as: method.import_as,
|
||||
tuning: method.tuning,
|
||||
linkage: method.linkage,
|
||||
})
|
||||
Ok(Function2 { body, ..method })
|
||||
}
|
||||
|
||||
fn run_statement<'input>(
|
||||
|
@ -1,32 +1,24 @@
|
||||
declare i32 @__zluda_ptx_impl_activemask() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @activemask(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 {
|
||||
%"35" = alloca i64, align 8, addrspace(5)
|
||||
%"36" = alloca i32, align 4, addrspace(5)
|
||||
define amdgpu_kernel void @activemask(ptr addrspace(4) byref(i64) %"29", ptr addrspace(4) byref(i64) %"30") #1 {
|
||||
%"31" = alloca i64, align 8, addrspace(5)
|
||||
%"32" = alloca i32, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"37" = load i64, ptr addrspace(4) %"34", align 4
|
||||
store i64 %"37", ptr addrspace(5) %"35", align 4
|
||||
%"38" = call i32 @__zluda_ptx_impl_activemask()
|
||||
store i32 %"38", ptr addrspace(5) %"36", align 4
|
||||
%"39" = load i64, ptr addrspace(5) %"35", align 4
|
||||
%"40" = load i32, ptr addrspace(5) %"36", align 4
|
||||
%"41" = inttoptr i64 %"39" to ptr
|
||||
store i32 %"40", ptr %"41", align 4
|
||||
br label %"28"
|
||||
|
||||
"28": ; preds = %1
|
||||
%"33" = load i64, ptr addrspace(4) %"30", align 4
|
||||
store i64 %"33", ptr addrspace(5) %"31", align 4
|
||||
%"34" = call i32 @__zluda_ptx_impl_activemask()
|
||||
store i32 %"34", ptr addrspace(5) %"32", align 4
|
||||
%"35" = load i64, ptr addrspace(5) %"31", align 4
|
||||
%"36" = load i32, ptr addrspace(5) %"32", align 4
|
||||
%"37" = inttoptr i64 %"35" to ptr
|
||||
store i32 %"36", ptr %"37", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -1,39 +1,30 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @add(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 {
|
||||
%"38" = alloca i64, align 8, addrspace(5)
|
||||
%"39" = alloca i64, align 8, addrspace(5)
|
||||
%"40" = alloca i64, align 8, addrspace(5)
|
||||
%"41" = alloca i64, align 8, addrspace(5)
|
||||
define amdgpu_kernel void @add(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) byref(i64) %"33") #0 {
|
||||
%"34" = alloca i64, align 8, addrspace(5)
|
||||
%"35" = alloca i64, align 8, addrspace(5)
|
||||
%"36" = alloca i64, align 8, addrspace(5)
|
||||
%"37" = alloca i64, align 8, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"42" = load i64, ptr addrspace(4) %"36", align 4
|
||||
store i64 %"42", ptr addrspace(5) %"38", align 4
|
||||
%"43" = load i64, ptr addrspace(4) %"37", align 4
|
||||
store i64 %"43", ptr addrspace(5) %"39", align 4
|
||||
%"45" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"50" = inttoptr i64 %"45" to ptr
|
||||
%"44" = load i64, ptr %"50", align 4
|
||||
store i64 %"44", ptr addrspace(5) %"40", align 4
|
||||
%"47" = load i64, ptr addrspace(5) %"40", align 4
|
||||
%"46" = add i64 %"47", 1
|
||||
store i64 %"46", ptr addrspace(5) %"41", align 4
|
||||
%"48" = load i64, ptr addrspace(5) %"39", align 4
|
||||
%"49" = load i64, ptr addrspace(5) %"41", align 4
|
||||
%"51" = inttoptr i64 %"48" to ptr
|
||||
store i64 %"49", ptr %"51", align 4
|
||||
br label %"31"
|
||||
|
||||
"31": ; preds = %1
|
||||
%"38" = load i64, ptr addrspace(4) %"32", align 4
|
||||
store i64 %"38", ptr addrspace(5) %"34", align 4
|
||||
%"39" = load i64, ptr addrspace(4) %"33", align 4
|
||||
store i64 %"39", ptr addrspace(5) %"35", align 4
|
||||
%"41" = load i64, ptr addrspace(5) %"34", align 4
|
||||
%"46" = inttoptr i64 %"41" to ptr
|
||||
%"40" = load i64, ptr %"46", align 4
|
||||
store i64 %"40", ptr addrspace(5) %"36", align 4
|
||||
%"43" = load i64, ptr addrspace(5) %"36", align 4
|
||||
%"42" = add i64 %"43", 1
|
||||
store i64 %"42", ptr addrspace(5) %"37", align 4
|
||||
%"44" = load i64, ptr addrspace(5) %"35", align 4
|
||||
%"45" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"47" = inttoptr i64 %"44" to ptr
|
||||
store i64 %"45", ptr %"47", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
52
ptx/src/test/ll/add_ftz.ll
Normal file
52
ptx/src/test/ll/add_ftz.ll
Normal file
@ -0,0 +1,52 @@
|
||||
define amdgpu_kernel void @add_ftz(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 {
|
||||
%"39" = alloca i64, align 8, addrspace(5)
|
||||
%"40" = alloca i64, align 8, addrspace(5)
|
||||
%"41" = alloca float, align 4, addrspace(5)
|
||||
%"42" = alloca float, align 4, addrspace(5)
|
||||
%"43" = alloca float, align 4, addrspace(5)
|
||||
%"44" = alloca float, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
br label %"36"
|
||||
|
||||
"36": ; preds = %1
|
||||
%"45" = load i64, ptr addrspace(4) %"37", align 4
|
||||
store i64 %"45", ptr addrspace(5) %"39", align 4
|
||||
%"46" = load i64, ptr addrspace(4) %"38", align 4
|
||||
store i64 %"46", ptr addrspace(5) %"40", align 4
|
||||
%"48" = load i64, ptr addrspace(5) %"39", align 4
|
||||
%"61" = inttoptr i64 %"48" to ptr
|
||||
%"47" = load float, ptr %"61", align 4
|
||||
store float %"47", ptr addrspace(5) %"41", align 4
|
||||
%"49" = load i64, ptr addrspace(5) %"39", align 4
|
||||
%"62" = inttoptr i64 %"49" to ptr
|
||||
%"33" = getelementptr inbounds i8, ptr %"62", i64 4
|
||||
%"50" = load float, ptr %"33", align 4
|
||||
store float %"50", ptr addrspace(5) %"42", align 4
|
||||
%"52" = load float, ptr addrspace(5) %"41", align 4
|
||||
%"53" = load float, ptr addrspace(5) %"42", align 4
|
||||
%"51" = fadd float %"52", %"53"
|
||||
store float %"51", ptr addrspace(5) %"43", align 4
|
||||
call void @llvm.amdgcn.s.setreg(i32 6401, i32 3)
|
||||
%"55" = load float, ptr addrspace(5) %"41", align 4
|
||||
%"56" = load float, ptr addrspace(5) %"42", align 4
|
||||
%"54" = fadd float %"55", %"56"
|
||||
store float %"54", ptr addrspace(5) %"44", align 4
|
||||
%"57" = load i64, ptr addrspace(5) %"40", align 4
|
||||
%"58" = load float, ptr addrspace(5) %"43", align 4
|
||||
%"63" = inttoptr i64 %"57" to ptr
|
||||
store float %"58", ptr %"63", align 4
|
||||
%"59" = load i64, ptr addrspace(5) %"40", align 4
|
||||
%"64" = inttoptr i64 %"59" to ptr
|
||||
%"35" = getelementptr inbounds i8, ptr %"64", i64 4
|
||||
%"60" = load float, ptr addrspace(5) %"44", align 4
|
||||
store float %"60", ptr %"35", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: nocallback nofree nosync nounwind willreturn
|
||||
declare void @llvm.amdgcn.s.setreg(i32 immarg, i32) #1
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { nocallback nofree nosync nounwind willreturn }
|
@ -1,39 +1,30 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @add_non_coherent(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 {
|
||||
%"38" = alloca i64, align 8, addrspace(5)
|
||||
%"39" = alloca i64, align 8, addrspace(5)
|
||||
%"40" = alloca i64, align 8, addrspace(5)
|
||||
%"41" = alloca i64, align 8, addrspace(5)
|
||||
define amdgpu_kernel void @add_non_coherent(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) byref(i64) %"33") #0 {
|
||||
%"34" = alloca i64, align 8, addrspace(5)
|
||||
%"35" = alloca i64, align 8, addrspace(5)
|
||||
%"36" = alloca i64, align 8, addrspace(5)
|
||||
%"37" = alloca i64, align 8, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"42" = load i64, ptr addrspace(4) %"36", align 4
|
||||
store i64 %"42", ptr addrspace(5) %"38", align 4
|
||||
%"43" = load i64, ptr addrspace(4) %"37", align 4
|
||||
store i64 %"43", ptr addrspace(5) %"39", align 4
|
||||
%"45" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"50" = inttoptr i64 %"45" to ptr addrspace(1)
|
||||
%"44" = load i64, ptr addrspace(1) %"50", align 4
|
||||
store i64 %"44", ptr addrspace(5) %"40", align 4
|
||||
%"47" = load i64, ptr addrspace(5) %"40", align 4
|
||||
%"46" = add i64 %"47", 1
|
||||
store i64 %"46", ptr addrspace(5) %"41", align 4
|
||||
%"48" = load i64, ptr addrspace(5) %"39", align 4
|
||||
%"49" = load i64, ptr addrspace(5) %"41", align 4
|
||||
%"51" = inttoptr i64 %"48" to ptr addrspace(1)
|
||||
store i64 %"49", ptr addrspace(1) %"51", align 4
|
||||
br label %"31"
|
||||
|
||||
"31": ; preds = %1
|
||||
%"38" = load i64, ptr addrspace(4) %"32", align 4
|
||||
store i64 %"38", ptr addrspace(5) %"34", align 4
|
||||
%"39" = load i64, ptr addrspace(4) %"33", align 4
|
||||
store i64 %"39", ptr addrspace(5) %"35", align 4
|
||||
%"41" = load i64, ptr addrspace(5) %"34", align 4
|
||||
%"46" = inttoptr i64 %"41" to ptr addrspace(1)
|
||||
%"40" = load i64, ptr addrspace(1) %"46", align 4
|
||||
store i64 %"40", ptr addrspace(5) %"36", align 4
|
||||
%"43" = load i64, ptr addrspace(5) %"36", align 4
|
||||
%"42" = add i64 %"43", 1
|
||||
store i64 %"42", ptr addrspace(5) %"37", align 4
|
||||
%"44" = load i64, ptr addrspace(5) %"35", align 4
|
||||
%"45" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"47" = inttoptr i64 %"44" to ptr addrspace(1)
|
||||
store i64 %"45", ptr addrspace(1) %"47", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -1,39 +1,30 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @add_tuning(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 {
|
||||
%"38" = alloca i64, align 8, addrspace(5)
|
||||
%"39" = alloca i64, align 8, addrspace(5)
|
||||
%"40" = alloca i64, align 8, addrspace(5)
|
||||
%"41" = alloca i64, align 8, addrspace(5)
|
||||
define amdgpu_kernel void @add_tuning(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) byref(i64) %"33") #0 {
|
||||
%"34" = alloca i64, align 8, addrspace(5)
|
||||
%"35" = alloca i64, align 8, addrspace(5)
|
||||
%"36" = alloca i64, align 8, addrspace(5)
|
||||
%"37" = alloca i64, align 8, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"42" = load i64, ptr addrspace(4) %"36", align 4
|
||||
store i64 %"42", ptr addrspace(5) %"38", align 4
|
||||
%"43" = load i64, ptr addrspace(4) %"37", align 4
|
||||
store i64 %"43", ptr addrspace(5) %"39", align 4
|
||||
%"45" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"50" = inttoptr i64 %"45" to ptr
|
||||
%"44" = load i64, ptr %"50", align 4
|
||||
store i64 %"44", ptr addrspace(5) %"40", align 4
|
||||
%"47" = load i64, ptr addrspace(5) %"40", align 4
|
||||
%"46" = add i64 %"47", 1
|
||||
store i64 %"46", ptr addrspace(5) %"41", align 4
|
||||
%"48" = load i64, ptr addrspace(5) %"39", align 4
|
||||
%"49" = load i64, ptr addrspace(5) %"41", align 4
|
||||
%"51" = inttoptr i64 %"48" to ptr
|
||||
store i64 %"49", ptr %"51", align 4
|
||||
br label %"31"
|
||||
|
||||
"31": ; preds = %1
|
||||
%"38" = load i64, ptr addrspace(4) %"32", align 4
|
||||
store i64 %"38", ptr addrspace(5) %"34", align 4
|
||||
%"39" = load i64, ptr addrspace(4) %"33", align 4
|
||||
store i64 %"39", ptr addrspace(5) %"35", align 4
|
||||
%"41" = load i64, ptr addrspace(5) %"34", align 4
|
||||
%"46" = inttoptr i64 %"41" to ptr
|
||||
%"40" = load i64, ptr %"46", align 4
|
||||
store i64 %"40", ptr addrspace(5) %"36", align 4
|
||||
%"43" = load i64, ptr addrspace(5) %"36", align 4
|
||||
%"42" = add i64 %"43", 1
|
||||
store i64 %"42", ptr addrspace(5) %"37", align 4
|
||||
%"44" = load i64, ptr addrspace(5) %"35", align 4
|
||||
%"45" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"47" = inttoptr i64 %"44" to ptr
|
||||
store i64 %"45", ptr %"47", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -1,45 +1,36 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @and(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 {
|
||||
%"39" = alloca i64, align 8, addrspace(5)
|
||||
%"40" = alloca i64, align 8, addrspace(5)
|
||||
%"41" = alloca i32, align 4, addrspace(5)
|
||||
%"42" = alloca i32, align 4, addrspace(5)
|
||||
define amdgpu_kernel void @and(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 {
|
||||
%"35" = alloca i64, align 8, addrspace(5)
|
||||
%"36" = alloca i64, align 8, addrspace(5)
|
||||
%"37" = alloca i32, align 4, addrspace(5)
|
||||
%"38" = alloca i32, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"43" = load i64, ptr addrspace(4) %"37", align 4
|
||||
store i64 %"43", ptr addrspace(5) %"39", align 4
|
||||
%"44" = load i64, ptr addrspace(4) %"38", align 4
|
||||
store i64 %"44", ptr addrspace(5) %"40", align 4
|
||||
%"46" = load i64, ptr addrspace(5) %"39", align 4
|
||||
%"54" = inttoptr i64 %"46" to ptr
|
||||
%"45" = load i32, ptr %"54", align 4
|
||||
store i32 %"45", ptr addrspace(5) %"41", align 4
|
||||
%"47" = load i64, ptr addrspace(5) %"39", align 4
|
||||
%"55" = inttoptr i64 %"47" to ptr
|
||||
%"30" = getelementptr inbounds i8, ptr %"55", i64 4
|
||||
%"48" = load i32, ptr %"30", align 4
|
||||
store i32 %"48", ptr addrspace(5) %"42", align 4
|
||||
%"50" = load i32, ptr addrspace(5) %"41", align 4
|
||||
%"51" = load i32, ptr addrspace(5) %"42", align 4
|
||||
%"56" = and i32 %"50", %"51"
|
||||
store i32 %"56", ptr addrspace(5) %"41", align 4
|
||||
%"52" = load i64, ptr addrspace(5) %"40", align 4
|
||||
%"53" = load i32, ptr addrspace(5) %"41", align 4
|
||||
%"59" = inttoptr i64 %"52" to ptr
|
||||
store i32 %"53", ptr %"59", align 4
|
||||
br label %"32"
|
||||
|
||||
"32": ; preds = %1
|
||||
%"39" = load i64, ptr addrspace(4) %"33", align 4
|
||||
store i64 %"39", ptr addrspace(5) %"35", align 4
|
||||
%"40" = load i64, ptr addrspace(4) %"34", align 4
|
||||
store i64 %"40", ptr addrspace(5) %"36", align 4
|
||||
%"42" = load i64, ptr addrspace(5) %"35", align 4
|
||||
%"50" = inttoptr i64 %"42" to ptr
|
||||
%"41" = load i32, ptr %"50", align 4
|
||||
store i32 %"41", ptr addrspace(5) %"37", align 4
|
||||
%"43" = load i64, ptr addrspace(5) %"35", align 4
|
||||
%"51" = inttoptr i64 %"43" to ptr
|
||||
%"31" = getelementptr inbounds i8, ptr %"51", i64 4
|
||||
%"44" = load i32, ptr %"31", align 4
|
||||
store i32 %"44", ptr addrspace(5) %"38", align 4
|
||||
%"46" = load i32, ptr addrspace(5) %"37", align 4
|
||||
%"47" = load i32, ptr addrspace(5) %"38", align 4
|
||||
%"52" = and i32 %"46", %"47"
|
||||
store i32 %"52", ptr addrspace(5) %"37", align 4
|
||||
%"48" = load i64, ptr addrspace(5) %"36", align 4
|
||||
%"49" = load i32, ptr addrspace(5) %"37", align 4
|
||||
%"55" = inttoptr i64 %"48" to ptr
|
||||
store i32 %"49", ptr %"55", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -1,55 +1,46 @@
|
||||
@shared_mem = external addrspace(3) global [1024 x i8], align 4
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @atom_add(ptr addrspace(4) byref(i64) %"40", ptr addrspace(4) byref(i64) %"41") #0 {
|
||||
%"42" = alloca i64, align 8, addrspace(5)
|
||||
%"43" = alloca i64, align 8, addrspace(5)
|
||||
%"44" = alloca i32, align 4, addrspace(5)
|
||||
%"45" = alloca i32, align 4, addrspace(5)
|
||||
define amdgpu_kernel void @atom_add(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 {
|
||||
%"38" = alloca i64, align 8, addrspace(5)
|
||||
%"39" = alloca i64, align 8, addrspace(5)
|
||||
%"40" = alloca i32, align 4, addrspace(5)
|
||||
%"41" = alloca i32, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"46" = load i64, ptr addrspace(4) %"40", align 4
|
||||
store i64 %"46", ptr addrspace(5) %"42", align 4
|
||||
%"47" = load i64, ptr addrspace(4) %"41", align 4
|
||||
store i64 %"47", ptr addrspace(5) %"43", align 4
|
||||
%"49" = load i64, ptr addrspace(5) %"42", align 4
|
||||
%"60" = inttoptr i64 %"49" to ptr
|
||||
%"48" = load i32, ptr %"60", align 4
|
||||
store i32 %"48", ptr addrspace(5) %"44", align 4
|
||||
%"50" = load i64, ptr addrspace(5) %"42", align 4
|
||||
%"61" = inttoptr i64 %"50" to ptr
|
||||
%"31" = getelementptr inbounds i8, ptr %"61", i64 4
|
||||
%"51" = load i32, ptr %"31", align 4
|
||||
store i32 %"51", ptr addrspace(5) %"45", align 4
|
||||
%"52" = load i32, ptr addrspace(5) %"44", align 4
|
||||
store i32 %"52", ptr addrspace(3) @shared_mem, align 4
|
||||
%"54" = load i32, ptr addrspace(5) %"45", align 4
|
||||
%2 = atomicrmw add ptr addrspace(3) @shared_mem, i32 %"54" syncscope("agent-one-as") monotonic, align 4
|
||||
store i32 %2, ptr addrspace(5) %"44", align 4
|
||||
%"55" = load i32, ptr addrspace(3) @shared_mem, align 4
|
||||
store i32 %"55", ptr addrspace(5) %"45", align 4
|
||||
%"56" = load i64, ptr addrspace(5) %"43", align 4
|
||||
%"57" = load i32, ptr addrspace(5) %"44", align 4
|
||||
%"65" = inttoptr i64 %"56" to ptr
|
||||
store i32 %"57", ptr %"65", align 4
|
||||
%"58" = load i64, ptr addrspace(5) %"43", align 4
|
||||
%"66" = inttoptr i64 %"58" to ptr
|
||||
%"33" = getelementptr inbounds i8, ptr %"66", i64 4
|
||||
%"59" = load i32, ptr addrspace(5) %"45", align 4
|
||||
store i32 %"59", ptr %"33", align 4
|
||||
br label %"35"
|
||||
|
||||
"35": ; preds = %1
|
||||
%"42" = load i64, ptr addrspace(4) %"36", align 4
|
||||
store i64 %"42", ptr addrspace(5) %"38", align 4
|
||||
%"43" = load i64, ptr addrspace(4) %"37", align 4
|
||||
store i64 %"43", ptr addrspace(5) %"39", align 4
|
||||
%"45" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"56" = inttoptr i64 %"45" to ptr
|
||||
%"44" = load i32, ptr %"56", align 4
|
||||
store i32 %"44", ptr addrspace(5) %"40", align 4
|
||||
%"46" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"57" = inttoptr i64 %"46" to ptr
|
||||
%"32" = getelementptr inbounds i8, ptr %"57", i64 4
|
||||
%"47" = load i32, ptr %"32", align 4
|
||||
store i32 %"47", ptr addrspace(5) %"41", align 4
|
||||
%"48" = load i32, ptr addrspace(5) %"40", align 4
|
||||
store i32 %"48", ptr addrspace(3) @shared_mem, align 4
|
||||
%"50" = load i32, ptr addrspace(5) %"41", align 4
|
||||
%2 = atomicrmw add ptr addrspace(3) @shared_mem, i32 %"50" syncscope("agent-one-as") monotonic, align 4
|
||||
store i32 %2, ptr addrspace(5) %"40", align 4
|
||||
%"51" = load i32, ptr addrspace(3) @shared_mem, align 4
|
||||
store i32 %"51", ptr addrspace(5) %"41", align 4
|
||||
%"52" = load i64, ptr addrspace(5) %"39", align 4
|
||||
%"53" = load i32, ptr addrspace(5) %"40", align 4
|
||||
%"61" = inttoptr i64 %"52" to ptr
|
||||
store i32 %"53", ptr %"61", align 4
|
||||
%"54" = load i64, ptr addrspace(5) %"39", align 4
|
||||
%"62" = inttoptr i64 %"54" to ptr
|
||||
%"34" = getelementptr inbounds i8, ptr %"62", i64 4
|
||||
%"55" = load i32, ptr addrspace(5) %"41", align 4
|
||||
store i32 %"55", ptr %"34", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -1,55 +1,46 @@
|
||||
@shared_mem = external addrspace(3) global [1024 x i8], align 4
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @atom_add_float(ptr addrspace(4) byref(i64) %"40", ptr addrspace(4) byref(i64) %"41") #0 {
|
||||
%"42" = alloca i64, align 8, addrspace(5)
|
||||
%"43" = alloca i64, align 8, addrspace(5)
|
||||
%"44" = alloca float, align 4, addrspace(5)
|
||||
%"45" = alloca float, align 4, addrspace(5)
|
||||
define amdgpu_kernel void @atom_add_float(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 {
|
||||
%"38" = alloca i64, align 8, addrspace(5)
|
||||
%"39" = alloca i64, align 8, addrspace(5)
|
||||
%"40" = alloca float, align 4, addrspace(5)
|
||||
%"41" = alloca float, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"46" = load i64, ptr addrspace(4) %"40", align 4
|
||||
store i64 %"46", ptr addrspace(5) %"42", align 4
|
||||
%"47" = load i64, ptr addrspace(4) %"41", align 4
|
||||
store i64 %"47", ptr addrspace(5) %"43", align 4
|
||||
%"49" = load i64, ptr addrspace(5) %"42", align 4
|
||||
%"60" = inttoptr i64 %"49" to ptr
|
||||
%"48" = load float, ptr %"60", align 4
|
||||
store float %"48", ptr addrspace(5) %"44", align 4
|
||||
%"50" = load i64, ptr addrspace(5) %"42", align 4
|
||||
%"61" = inttoptr i64 %"50" to ptr
|
||||
%"31" = getelementptr inbounds i8, ptr %"61", i64 4
|
||||
%"51" = load float, ptr %"31", align 4
|
||||
store float %"51", ptr addrspace(5) %"45", align 4
|
||||
%"52" = load float, ptr addrspace(5) %"44", align 4
|
||||
store float %"52", ptr addrspace(3) @shared_mem, align 4
|
||||
%"54" = load float, ptr addrspace(5) %"45", align 4
|
||||
%2 = atomicrmw fadd ptr addrspace(3) @shared_mem, float %"54" syncscope("agent-one-as") monotonic, align 4
|
||||
store float %2, ptr addrspace(5) %"44", align 4
|
||||
%"55" = load float, ptr addrspace(3) @shared_mem, align 4
|
||||
store float %"55", ptr addrspace(5) %"45", align 4
|
||||
%"56" = load i64, ptr addrspace(5) %"43", align 4
|
||||
%"57" = load float, ptr addrspace(5) %"44", align 4
|
||||
%"65" = inttoptr i64 %"56" to ptr
|
||||
store float %"57", ptr %"65", align 4
|
||||
%"58" = load i64, ptr addrspace(5) %"43", align 4
|
||||
%"66" = inttoptr i64 %"58" to ptr
|
||||
%"33" = getelementptr inbounds i8, ptr %"66", i64 4
|
||||
%"59" = load float, ptr addrspace(5) %"45", align 4
|
||||
store float %"59", ptr %"33", align 4
|
||||
br label %"35"
|
||||
|
||||
"35": ; preds = %1
|
||||
%"42" = load i64, ptr addrspace(4) %"36", align 4
|
||||
store i64 %"42", ptr addrspace(5) %"38", align 4
|
||||
%"43" = load i64, ptr addrspace(4) %"37", align 4
|
||||
store i64 %"43", ptr addrspace(5) %"39", align 4
|
||||
%"45" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"56" = inttoptr i64 %"45" to ptr
|
||||
%"44" = load float, ptr %"56", align 4
|
||||
store float %"44", ptr addrspace(5) %"40", align 4
|
||||
%"46" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"57" = inttoptr i64 %"46" to ptr
|
||||
%"32" = getelementptr inbounds i8, ptr %"57", i64 4
|
||||
%"47" = load float, ptr %"32", align 4
|
||||
store float %"47", ptr addrspace(5) %"41", align 4
|
||||
%"48" = load float, ptr addrspace(5) %"40", align 4
|
||||
store float %"48", ptr addrspace(3) @shared_mem, align 4
|
||||
%"50" = load float, ptr addrspace(5) %"41", align 4
|
||||
%2 = atomicrmw fadd ptr addrspace(3) @shared_mem, float %"50" syncscope("agent-one-as") monotonic, align 4
|
||||
store float %2, ptr addrspace(5) %"40", align 4
|
||||
%"51" = load float, ptr addrspace(3) @shared_mem, align 4
|
||||
store float %"51", ptr addrspace(5) %"41", align 4
|
||||
%"52" = load i64, ptr addrspace(5) %"39", align 4
|
||||
%"53" = load float, ptr addrspace(5) %"40", align 4
|
||||
%"61" = inttoptr i64 %"52" to ptr
|
||||
store float %"53", ptr %"61", align 4
|
||||
%"54" = load i64, ptr addrspace(5) %"39", align 4
|
||||
%"62" = inttoptr i64 %"54" to ptr
|
||||
%"34" = getelementptr inbounds i8, ptr %"62", i64 4
|
||||
%"55" = load float, ptr addrspace(5) %"41", align 4
|
||||
store float %"55", ptr %"34", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -1,53 +1,44 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @atom_cas(ptr addrspace(4) byref(i64) %"42", ptr addrspace(4) byref(i64) %"43") #0 {
|
||||
%"44" = alloca i64, align 8, addrspace(5)
|
||||
%"45" = alloca i64, align 8, addrspace(5)
|
||||
%"46" = alloca i32, align 4, addrspace(5)
|
||||
%"47" = alloca i32, align 4, addrspace(5)
|
||||
define amdgpu_kernel void @atom_cas(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #0 {
|
||||
%"40" = alloca i64, align 8, addrspace(5)
|
||||
%"41" = alloca i64, align 8, addrspace(5)
|
||||
%"42" = alloca i32, align 4, addrspace(5)
|
||||
%"43" = alloca i32, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"48" = load i64, ptr addrspace(4) %"42", align 4
|
||||
store i64 %"48", ptr addrspace(5) %"44", align 4
|
||||
%"49" = load i64, ptr addrspace(4) %"43", align 4
|
||||
store i64 %"49", ptr addrspace(5) %"45", align 4
|
||||
%"51" = load i64, ptr addrspace(5) %"44", align 4
|
||||
br label %"37"
|
||||
|
||||
"37": ; preds = %1
|
||||
%"44" = load i64, ptr addrspace(4) %"38", align 4
|
||||
store i64 %"44", ptr addrspace(5) %"40", align 4
|
||||
%"45" = load i64, ptr addrspace(4) %"39", align 4
|
||||
store i64 %"45", ptr addrspace(5) %"41", align 4
|
||||
%"47" = load i64, ptr addrspace(5) %"40", align 4
|
||||
%"57" = inttoptr i64 %"47" to ptr
|
||||
%"46" = load i32, ptr %"57", align 4
|
||||
store i32 %"46", ptr addrspace(5) %"42", align 4
|
||||
%"48" = load i64, ptr addrspace(5) %"40", align 4
|
||||
%"58" = inttoptr i64 %"48" to ptr
|
||||
%"31" = getelementptr inbounds i8, ptr %"58", i64 4
|
||||
%"50" = load i32, ptr addrspace(5) %"42", align 4
|
||||
%2 = cmpxchg ptr %"31", i32 %"50", i32 100 syncscope("agent-one-as") monotonic monotonic, align 4
|
||||
%"59" = extractvalue { i32, i1 } %2, 0
|
||||
store i32 %"59", ptr addrspace(5) %"42", align 4
|
||||
%"51" = load i64, ptr addrspace(5) %"40", align 4
|
||||
%"61" = inttoptr i64 %"51" to ptr
|
||||
%"50" = load i32, ptr %"61", align 4
|
||||
store i32 %"50", ptr addrspace(5) %"46", align 4
|
||||
%"52" = load i64, ptr addrspace(5) %"44", align 4
|
||||
%"62" = inttoptr i64 %"52" to ptr
|
||||
%"30" = getelementptr inbounds i8, ptr %"62", i64 4
|
||||
%"54" = load i32, ptr addrspace(5) %"46", align 4
|
||||
%2 = cmpxchg ptr %"30", i32 %"54", i32 100 syncscope("agent-one-as") monotonic monotonic, align 4
|
||||
%"63" = extractvalue { i32, i1 } %2, 0
|
||||
store i32 %"63", ptr addrspace(5) %"46", align 4
|
||||
%"55" = load i64, ptr addrspace(5) %"44", align 4
|
||||
%"65" = inttoptr i64 %"55" to ptr
|
||||
%"33" = getelementptr inbounds i8, ptr %"65", i64 4
|
||||
%"56" = load i32, ptr %"33", align 4
|
||||
store i32 %"56", ptr addrspace(5) %"47", align 4
|
||||
%"57" = load i64, ptr addrspace(5) %"45", align 4
|
||||
%"58" = load i32, ptr addrspace(5) %"46", align 4
|
||||
%"66" = inttoptr i64 %"57" to ptr
|
||||
store i32 %"58", ptr %"66", align 4
|
||||
%"59" = load i64, ptr addrspace(5) %"45", align 4
|
||||
%"67" = inttoptr i64 %"59" to ptr
|
||||
%"35" = getelementptr inbounds i8, ptr %"67", i64 4
|
||||
%"60" = load i32, ptr addrspace(5) %"47", align 4
|
||||
store i32 %"60", ptr %"35", align 4
|
||||
%"34" = getelementptr inbounds i8, ptr %"61", i64 4
|
||||
%"52" = load i32, ptr %"34", align 4
|
||||
store i32 %"52", ptr addrspace(5) %"43", align 4
|
||||
%"53" = load i64, ptr addrspace(5) %"41", align 4
|
||||
%"54" = load i32, ptr addrspace(5) %"42", align 4
|
||||
%"62" = inttoptr i64 %"53" to ptr
|
||||
store i32 %"54", ptr %"62", align 4
|
||||
%"55" = load i64, ptr addrspace(5) %"41", align 4
|
||||
%"63" = inttoptr i64 %"55" to ptr
|
||||
%"36" = getelementptr inbounds i8, ptr %"63", i64 4
|
||||
%"56" = load i32, ptr addrspace(5) %"43", align 4
|
||||
store i32 %"56", ptr %"36", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -1,55 +1,46 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @atom_inc(ptr addrspace(4) byref(i64) %"42", ptr addrspace(4) byref(i64) %"43") #0 {
|
||||
%"44" = alloca i64, align 8, addrspace(5)
|
||||
%"45" = alloca i64, align 8, addrspace(5)
|
||||
%"46" = alloca i32, align 4, addrspace(5)
|
||||
%"47" = alloca i32, align 4, addrspace(5)
|
||||
%"48" = alloca i32, align 4, addrspace(5)
|
||||
define amdgpu_kernel void @atom_inc(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #0 {
|
||||
%"40" = alloca i64, align 8, addrspace(5)
|
||||
%"41" = alloca i64, align 8, addrspace(5)
|
||||
%"42" = alloca i32, align 4, addrspace(5)
|
||||
%"43" = alloca i32, align 4, addrspace(5)
|
||||
%"44" = alloca i32, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"49" = load i64, ptr addrspace(4) %"42", align 4
|
||||
store i64 %"49", ptr addrspace(5) %"44", align 4
|
||||
%"50" = load i64, ptr addrspace(4) %"43", align 4
|
||||
store i64 %"50", ptr addrspace(5) %"45", align 4
|
||||
%"52" = load i64, ptr addrspace(5) %"44", align 4
|
||||
%"63" = inttoptr i64 %"52" to ptr
|
||||
%2 = atomicrmw uinc_wrap ptr %"63", i32 101 syncscope("agent-one-as") monotonic, align 4
|
||||
store i32 %2, ptr addrspace(5) %"46", align 4
|
||||
%"54" = load i64, ptr addrspace(5) %"44", align 4
|
||||
%"64" = inttoptr i64 %"54" to ptr addrspace(1)
|
||||
%3 = atomicrmw uinc_wrap ptr addrspace(1) %"64", i32 101 syncscope("agent-one-as") monotonic, align 4
|
||||
store i32 %3, ptr addrspace(5) %"47", align 4
|
||||
%"56" = load i64, ptr addrspace(5) %"44", align 4
|
||||
%"65" = inttoptr i64 %"56" to ptr
|
||||
%"55" = load i32, ptr %"65", align 4
|
||||
store i32 %"55", ptr addrspace(5) %"48", align 4
|
||||
%"57" = load i64, ptr addrspace(5) %"45", align 4
|
||||
%"58" = load i32, ptr addrspace(5) %"46", align 4
|
||||
%"66" = inttoptr i64 %"57" to ptr
|
||||
store i32 %"58", ptr %"66", align 4
|
||||
%"59" = load i64, ptr addrspace(5) %"45", align 4
|
||||
%"67" = inttoptr i64 %"59" to ptr
|
||||
%"33" = getelementptr inbounds i8, ptr %"67", i64 4
|
||||
%"60" = load i32, ptr addrspace(5) %"47", align 4
|
||||
store i32 %"60", ptr %"33", align 4
|
||||
%"61" = load i64, ptr addrspace(5) %"45", align 4
|
||||
%"68" = inttoptr i64 %"61" to ptr
|
||||
%"35" = getelementptr inbounds i8, ptr %"68", i64 8
|
||||
%"62" = load i32, ptr addrspace(5) %"48", align 4
|
||||
store i32 %"62", ptr %"35", align 4
|
||||
br label %"37"
|
||||
|
||||
"37": ; preds = %1
|
||||
%"45" = load i64, ptr addrspace(4) %"38", align 4
|
||||
store i64 %"45", ptr addrspace(5) %"40", align 4
|
||||
%"46" = load i64, ptr addrspace(4) %"39", align 4
|
||||
store i64 %"46", ptr addrspace(5) %"41", align 4
|
||||
%"48" = load i64, ptr addrspace(5) %"40", align 4
|
||||
%"59" = inttoptr i64 %"48" to ptr
|
||||
%2 = atomicrmw uinc_wrap ptr %"59", i32 101 syncscope("agent-one-as") monotonic, align 4
|
||||
store i32 %2, ptr addrspace(5) %"42", align 4
|
||||
%"50" = load i64, ptr addrspace(5) %"40", align 4
|
||||
%"60" = inttoptr i64 %"50" to ptr addrspace(1)
|
||||
%3 = atomicrmw uinc_wrap ptr addrspace(1) %"60", i32 101 syncscope("agent-one-as") monotonic, align 4
|
||||
store i32 %3, ptr addrspace(5) %"43", align 4
|
||||
%"52" = load i64, ptr addrspace(5) %"40", align 4
|
||||
%"61" = inttoptr i64 %"52" to ptr
|
||||
%"51" = load i32, ptr %"61", align 4
|
||||
store i32 %"51", ptr addrspace(5) %"44", align 4
|
||||
%"53" = load i64, ptr addrspace(5) %"41", align 4
|
||||
%"54" = load i32, ptr addrspace(5) %"42", align 4
|
||||
%"62" = inttoptr i64 %"53" to ptr
|
||||
store i32 %"54", ptr %"62", align 4
|
||||
%"55" = load i64, ptr addrspace(5) %"41", align 4
|
||||
%"63" = inttoptr i64 %"55" to ptr
|
||||
%"34" = getelementptr inbounds i8, ptr %"63", i64 4
|
||||
%"56" = load i32, ptr addrspace(5) %"43", align 4
|
||||
store i32 %"56", ptr %"34", align 4
|
||||
%"57" = load i64, ptr addrspace(5) %"41", align 4
|
||||
%"64" = inttoptr i64 %"57" to ptr
|
||||
%"36" = getelementptr inbounds i8, ptr %"64", i64 8
|
||||
%"58" = load i32, ptr addrspace(5) %"44", align 4
|
||||
store i32 %"58", ptr %"36", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -1,39 +1,30 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @b64tof64(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 {
|
||||
%"37" = alloca double, align 8, addrspace(5)
|
||||
%"38" = alloca i64, align 8, addrspace(5)
|
||||
%"39" = alloca i64, align 8, addrspace(5)
|
||||
%"40" = alloca i64, align 8, addrspace(5)
|
||||
define amdgpu_kernel void @b64tof64(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
|
||||
%"33" = alloca double, align 8, addrspace(5)
|
||||
%"34" = alloca i64, align 8, addrspace(5)
|
||||
%"35" = alloca i64, align 8, addrspace(5)
|
||||
%"36" = alloca i64, align 8, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"41" = load double, ptr addrspace(4) %"35", align 8
|
||||
store double %"41", ptr addrspace(5) %"37", align 8
|
||||
%"42" = load i64, ptr addrspace(4) %"36", align 4
|
||||
store i64 %"42", ptr addrspace(5) %"39", align 4
|
||||
%"44" = load double, ptr addrspace(5) %"37", align 8
|
||||
%"50" = bitcast double %"44" to i64
|
||||
store i64 %"50", ptr addrspace(5) %"38", align 4
|
||||
%"46" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"51" = inttoptr i64 %"46" to ptr
|
||||
%"45" = load i64, ptr %"51", align 4
|
||||
store i64 %"45", ptr addrspace(5) %"40", align 4
|
||||
%"47" = load i64, ptr addrspace(5) %"39", align 4
|
||||
%"48" = load i64, ptr addrspace(5) %"40", align 4
|
||||
%"52" = inttoptr i64 %"47" to ptr
|
||||
store i64 %"48", ptr %"52", align 4
|
||||
br label %"30"
|
||||
|
||||
"30": ; preds = %1
|
||||
%"37" = load double, ptr addrspace(4) %"31", align 8
|
||||
store double %"37", ptr addrspace(5) %"33", align 8
|
||||
%"38" = load i64, ptr addrspace(4) %"32", align 4
|
||||
store i64 %"38", ptr addrspace(5) %"35", align 4
|
||||
%"40" = load double, ptr addrspace(5) %"33", align 8
|
||||
%"46" = bitcast double %"40" to i64
|
||||
store i64 %"46", ptr addrspace(5) %"34", align 4
|
||||
%"42" = load i64, ptr addrspace(5) %"34", align 4
|
||||
%"47" = inttoptr i64 %"42" to ptr
|
||||
%"41" = load i64, ptr %"47", align 4
|
||||
store i64 %"41", ptr addrspace(5) %"36", align 4
|
||||
%"43" = load i64, ptr addrspace(5) %"35", align 4
|
||||
%"44" = load i64, ptr addrspace(5) %"36", align 4
|
||||
%"48" = inttoptr i64 %"43" to ptr
|
||||
store i64 %"44", ptr %"48", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
91
ptx/src/test/ll/bench.ll
Normal file
91
ptx/src/test/ll/bench.ll
Normal file
@ -0,0 +1,91 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @bench(ptr addrspace(4) byref(i64) %"55", ptr addrspace(4) byref(i64) %"56") #1 {
|
||||
%"57" = alloca i64, align 8, addrspace(5)
|
||||
%"58" = alloca i64, align 8, addrspace(5)
|
||||
%"59" = alloca float, align 4, addrspace(5)
|
||||
%"60" = alloca float, align 4, addrspace(5)
|
||||
%"61" = alloca float, align 4, addrspace(5)
|
||||
%"62" = alloca float, align 4, addrspace(5)
|
||||
%"63" = alloca i32, align 4, addrspace(5)
|
||||
%"64" = alloca i1, align 1, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
br label %"97"
|
||||
|
||||
"97": ; preds = %1
|
||||
%"65" = load i64, ptr addrspace(4) %"55", align 4
|
||||
store i64 %"65", ptr addrspace(5) %"57", align 4
|
||||
%"66" = load i64, ptr addrspace(4) %"56", align 4
|
||||
store i64 %"66", ptr addrspace(5) %"58", align 4
|
||||
%"68" = load i64, ptr addrspace(5) %"57", align 4
|
||||
%"91" = inttoptr i64 %"68" to ptr
|
||||
%"67" = load float, ptr %"91", align 4
|
||||
store float %"67", ptr addrspace(5) %"59", align 4
|
||||
%"69" = load i64, ptr addrspace(5) %"57", align 4
|
||||
%"92" = inttoptr i64 %"69" to ptr
|
||||
%"39" = getelementptr inbounds i8, ptr %"92", i64 4
|
||||
%"70" = load float, ptr %"39", align 4
|
||||
store float %"70", ptr addrspace(5) %"60", align 4
|
||||
%"71" = load i64, ptr addrspace(5) %"57", align 4
|
||||
%"93" = inttoptr i64 %"71" to ptr
|
||||
%"41" = getelementptr inbounds i8, ptr %"93", i64 8
|
||||
%"72" = load float, ptr %"41", align 4
|
||||
store float %"72", ptr addrspace(5) %"61", align 4
|
||||
%"73" = load i64, ptr addrspace(5) %"57", align 4
|
||||
%"94" = inttoptr i64 %"73" to ptr
|
||||
%"43" = getelementptr inbounds i8, ptr %"94", i64 12
|
||||
%"74" = load float, ptr %"43", align 4
|
||||
store float %"74", ptr addrspace(5) %"62", align 4
|
||||
store i32 0, ptr addrspace(5) %"63", align 4
|
||||
br label %"10"
|
||||
|
||||
"10": ; preds = %"21", %"97"
|
||||
%"77" = load float, ptr addrspace(5) %"59", align 4
|
||||
%"78" = load float, ptr addrspace(5) %"60", align 4
|
||||
call void asm sideeffect "s_denorm_mode 0", "~{mode}"()
|
||||
%"76" = fmul float %"77", %"78"
|
||||
store float %"76", ptr addrspace(5) %"59", align 4
|
||||
%"80" = load float, ptr addrspace(5) %"61", align 4
|
||||
%"81" = load float, ptr addrspace(5) %"62", align 4
|
||||
call void asm sideeffect "s_denorm_mode 11", "~{mode}"()
|
||||
%"79" = fmul float %"80", %"81"
|
||||
store float %"79", ptr addrspace(5) %"61", align 4
|
||||
%"83" = load i32, ptr addrspace(5) %"63", align 4
|
||||
%"82" = add i32 %"83", 1
|
||||
store i32 %"82", ptr addrspace(5) %"63", align 4
|
||||
%"85" = load i32, ptr addrspace(5) %"63", align 4
|
||||
%"84" = icmp eq i32 %"85", 100000000
|
||||
store i1 %"84", ptr addrspace(5) %"64", align 1
|
||||
%"86" = load i1, ptr addrspace(5) %"64", align 1
|
||||
br i1 %"86", label %"11", label %"21"
|
||||
|
||||
"21": ; preds = %"10"
|
||||
br label %"10"
|
||||
|
||||
"11": ; preds = %"10"
|
||||
%"87" = load i64, ptr addrspace(5) %"58", align 4
|
||||
%"88" = load float, ptr addrspace(5) %"59", align 4
|
||||
%"95" = inttoptr i64 %"87" to ptr
|
||||
store float %"88", ptr %"95", align 4
|
||||
%"89" = load i64, ptr addrspace(5) %"58", align 4
|
||||
%"96" = inttoptr i64 %"89" to ptr
|
||||
%"48" = getelementptr inbounds i8, ptr %"96", i64 4
|
||||
%"90" = load float, ptr addrspace(5) %"61", align 4
|
||||
store float %"90", ptr %"48", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -1,54 +1,46 @@
|
||||
declare i32 @__zluda_ptx_impl_bfe_u32(i32, i32, i32) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @bfe(ptr addrspace(4) byref(i64) %"40", ptr addrspace(4) byref(i64) %"41") #0 {
|
||||
%"42" = alloca i64, align 8, addrspace(5)
|
||||
%"43" = alloca i64, align 8, addrspace(5)
|
||||
%"44" = alloca i32, align 4, addrspace(5)
|
||||
%"45" = alloca i32, align 4, addrspace(5)
|
||||
%"46" = alloca i32, align 4, addrspace(5)
|
||||
define amdgpu_kernel void @bfe(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #1 {
|
||||
%"38" = alloca i64, align 8, addrspace(5)
|
||||
%"39" = alloca i64, align 8, addrspace(5)
|
||||
%"40" = alloca i32, align 4, addrspace(5)
|
||||
%"41" = alloca i32, align 4, addrspace(5)
|
||||
%"42" = alloca i32, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"47" = load i64, ptr addrspace(4) %"40", align 4
|
||||
store i64 %"47", ptr addrspace(5) %"42", align 4
|
||||
%"48" = load i64, ptr addrspace(4) %"41", align 4
|
||||
store i64 %"48", ptr addrspace(5) %"43", align 4
|
||||
%"50" = load i64, ptr addrspace(5) %"42", align 4
|
||||
%"61" = inttoptr i64 %"50" to ptr
|
||||
%"49" = load i32, ptr %"61", align 4
|
||||
store i32 %"49", ptr addrspace(5) %"44", align 4
|
||||
%"51" = load i64, ptr addrspace(5) %"42", align 4
|
||||
%"62" = inttoptr i64 %"51" to ptr
|
||||
%"31" = getelementptr inbounds i8, ptr %"62", i64 4
|
||||
%"52" = load i32, ptr %"31", align 4
|
||||
store i32 %"52", ptr addrspace(5) %"45", align 4
|
||||
%"53" = load i64, ptr addrspace(5) %"42", align 4
|
||||
%"63" = inttoptr i64 %"53" to ptr
|
||||
%"33" = getelementptr inbounds i8, ptr %"63", i64 8
|
||||
%"54" = load i32, ptr %"33", align 4
|
||||
store i32 %"54", ptr addrspace(5) %"46", align 4
|
||||
%"56" = load i32, ptr addrspace(5) %"44", align 4
|
||||
%"57" = load i32, ptr addrspace(5) %"45", align 4
|
||||
%"58" = load i32, ptr addrspace(5) %"46", align 4
|
||||
%"55" = call i32 @__zluda_ptx_impl_bfe_u32(i32 %"56", i32 %"57", i32 %"58")
|
||||
store i32 %"55", ptr addrspace(5) %"44", align 4
|
||||
%"59" = load i64, ptr addrspace(5) %"43", align 4
|
||||
%"60" = load i32, ptr addrspace(5) %"44", align 4
|
||||
%"64" = inttoptr i64 %"59" to ptr
|
||||
store i32 %"60", ptr %"64", align 4
|
||||
br label %"35"
|
||||
|
||||
"35": ; preds = %1
|
||||
%"43" = load i64, ptr addrspace(4) %"36", align 4
|
||||
store i64 %"43", ptr addrspace(5) %"38", align 4
|
||||
%"44" = load i64, ptr addrspace(4) %"37", align 4
|
||||
store i64 %"44", ptr addrspace(5) %"39", align 4
|
||||
%"46" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"57" = inttoptr i64 %"46" to ptr
|
||||
%"45" = load i32, ptr %"57", align 4
|
||||
store i32 %"45", ptr addrspace(5) %"40", align 4
|
||||
%"47" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"58" = inttoptr i64 %"47" to ptr
|
||||
%"32" = getelementptr inbounds i8, ptr %"58", i64 4
|
||||
%"48" = load i32, ptr %"32", align 4
|
||||
store i32 %"48", ptr addrspace(5) %"41", align 4
|
||||
%"49" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"59" = inttoptr i64 %"49" to ptr
|
||||
%"34" = getelementptr inbounds i8, ptr %"59", i64 8
|
||||
%"50" = load i32, ptr %"34", align 4
|
||||
store i32 %"50", ptr addrspace(5) %"42", align 4
|
||||
%"52" = load i32, ptr addrspace(5) %"40", align 4
|
||||
%"53" = load i32, ptr addrspace(5) %"41", align 4
|
||||
%"54" = load i32, ptr addrspace(5) %"42", align 4
|
||||
%"51" = call i32 @__zluda_ptx_impl_bfe_u32(i32 %"52", i32 %"53", i32 %"54")
|
||||
store i32 %"51", ptr addrspace(5) %"40", align 4
|
||||
%"55" = load i64, ptr addrspace(5) %"39", align 4
|
||||
%"56" = load i32, ptr addrspace(5) %"40", align 4
|
||||
%"60" = inttoptr i64 %"55" to ptr
|
||||
store i32 %"56", ptr %"60", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -1,61 +1,53 @@
|
||||
declare i32 @__zluda_ptx_impl_bfi_b32(i32, i32, i32, i32) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @bfi(ptr addrspace(4) byref(i64) %"43", ptr addrspace(4) byref(i64) %"44") #0 {
|
||||
%"45" = alloca i64, align 8, addrspace(5)
|
||||
%"46" = alloca i64, align 8, addrspace(5)
|
||||
%"47" = alloca i32, align 4, addrspace(5)
|
||||
%"48" = alloca i32, align 4, addrspace(5)
|
||||
%"49" = alloca i32, align 4, addrspace(5)
|
||||
%"50" = alloca i32, align 4, addrspace(5)
|
||||
define amdgpu_kernel void @bfi(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #1 {
|
||||
%"41" = alloca i64, align 8, addrspace(5)
|
||||
%"42" = alloca i64, align 8, addrspace(5)
|
||||
%"43" = alloca i32, align 4, addrspace(5)
|
||||
%"44" = alloca i32, align 4, addrspace(5)
|
||||
%"45" = alloca i32, align 4, addrspace(5)
|
||||
%"46" = alloca i32, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"51" = load i64, ptr addrspace(4) %"43", align 4
|
||||
store i64 %"51", ptr addrspace(5) %"45", align 4
|
||||
%"52" = load i64, ptr addrspace(4) %"44", align 4
|
||||
store i64 %"52", ptr addrspace(5) %"46", align 4
|
||||
%"54" = load i64, ptr addrspace(5) %"45", align 4
|
||||
%"68" = inttoptr i64 %"54" to ptr
|
||||
%"53" = load i32, ptr %"68", align 4
|
||||
store i32 %"53", ptr addrspace(5) %"47", align 4
|
||||
%"55" = load i64, ptr addrspace(5) %"45", align 4
|
||||
%"69" = inttoptr i64 %"55" to ptr
|
||||
%"32" = getelementptr inbounds i8, ptr %"69", i64 4
|
||||
%"56" = load i32, ptr %"32", align 4
|
||||
store i32 %"56", ptr addrspace(5) %"48", align 4
|
||||
%"57" = load i64, ptr addrspace(5) %"45", align 4
|
||||
%"70" = inttoptr i64 %"57" to ptr
|
||||
%"34" = getelementptr inbounds i8, ptr %"70", i64 8
|
||||
%"58" = load i32, ptr %"34", align 4
|
||||
store i32 %"58", ptr addrspace(5) %"49", align 4
|
||||
%"59" = load i64, ptr addrspace(5) %"45", align 4
|
||||
%"71" = inttoptr i64 %"59" to ptr
|
||||
%"36" = getelementptr inbounds i8, ptr %"71", i64 12
|
||||
%"60" = load i32, ptr %"36", align 4
|
||||
store i32 %"60", ptr addrspace(5) %"50", align 4
|
||||
%"62" = load i32, ptr addrspace(5) %"47", align 4
|
||||
%"63" = load i32, ptr addrspace(5) %"48", align 4
|
||||
%"64" = load i32, ptr addrspace(5) %"49", align 4
|
||||
%"65" = load i32, ptr addrspace(5) %"50", align 4
|
||||
%"72" = call i32 @__zluda_ptx_impl_bfi_b32(i32 %"62", i32 %"63", i32 %"64", i32 %"65")
|
||||
store i32 %"72", ptr addrspace(5) %"47", align 4
|
||||
%"66" = load i64, ptr addrspace(5) %"46", align 4
|
||||
%"67" = load i32, ptr addrspace(5) %"47", align 4
|
||||
%"75" = inttoptr i64 %"66" to ptr
|
||||
store i32 %"67", ptr %"75", align 4
|
||||
br label %"38"
|
||||
|
||||
"38": ; preds = %1
|
||||
%"47" = load i64, ptr addrspace(4) %"39", align 4
|
||||
store i64 %"47", ptr addrspace(5) %"41", align 4
|
||||
%"48" = load i64, ptr addrspace(4) %"40", align 4
|
||||
store i64 %"48", ptr addrspace(5) %"42", align 4
|
||||
%"50" = load i64, ptr addrspace(5) %"41", align 4
|
||||
%"64" = inttoptr i64 %"50" to ptr
|
||||
%"49" = load i32, ptr %"64", align 4
|
||||
store i32 %"49", ptr addrspace(5) %"43", align 4
|
||||
%"51" = load i64, ptr addrspace(5) %"41", align 4
|
||||
%"65" = inttoptr i64 %"51" to ptr
|
||||
%"33" = getelementptr inbounds i8, ptr %"65", i64 4
|
||||
%"52" = load i32, ptr %"33", align 4
|
||||
store i32 %"52", ptr addrspace(5) %"44", align 4
|
||||
%"53" = load i64, ptr addrspace(5) %"41", align 4
|
||||
%"66" = inttoptr i64 %"53" to ptr
|
||||
%"35" = getelementptr inbounds i8, ptr %"66", i64 8
|
||||
%"54" = load i32, ptr %"35", align 4
|
||||
store i32 %"54", ptr addrspace(5) %"45", align 4
|
||||
%"55" = load i64, ptr addrspace(5) %"41", align 4
|
||||
%"67" = inttoptr i64 %"55" to ptr
|
||||
%"37" = getelementptr inbounds i8, ptr %"67", i64 12
|
||||
%"56" = load i32, ptr %"37", align 4
|
||||
store i32 %"56", ptr addrspace(5) %"46", align 4
|
||||
%"58" = load i32, ptr addrspace(5) %"43", align 4
|
||||
%"59" = load i32, ptr addrspace(5) %"44", align 4
|
||||
%"60" = load i32, ptr addrspace(5) %"45", align 4
|
||||
%"61" = load i32, ptr addrspace(5) %"46", align 4
|
||||
%"68" = call i32 @__zluda_ptx_impl_bfi_b32(i32 %"58", i32 %"59", i32 %"60", i32 %"61")
|
||||
store i32 %"68", ptr addrspace(5) %"43", align 4
|
||||
%"62" = load i64, ptr addrspace(5) %"42", align 4
|
||||
%"63" = load i32, ptr addrspace(5) %"43", align 4
|
||||
%"71" = inttoptr i64 %"62" to ptr
|
||||
store i32 %"63", ptr %"71", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -1,43 +1,34 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @block(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #0 {
|
||||
%"40" = alloca i64, align 8, addrspace(5)
|
||||
%"41" = alloca i64, align 8, addrspace(5)
|
||||
%"42" = alloca i64, align 8, addrspace(5)
|
||||
%"43" = alloca i64, align 8, addrspace(5)
|
||||
%"50" = alloca i64, align 8, addrspace(5)
|
||||
define amdgpu_kernel void @block(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 {
|
||||
%"36" = alloca i64, align 8, addrspace(5)
|
||||
%"37" = alloca i64, align 8, addrspace(5)
|
||||
%"38" = alloca i64, align 8, addrspace(5)
|
||||
%"39" = alloca i64, align 8, addrspace(5)
|
||||
%"46" = alloca i64, align 8, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"44" = load i64, ptr addrspace(4) %"38", align 4
|
||||
store i64 %"44", ptr addrspace(5) %"40", align 4
|
||||
%"45" = load i64, ptr addrspace(4) %"39", align 4
|
||||
store i64 %"45", ptr addrspace(5) %"41", align 4
|
||||
%"47" = load i64, ptr addrspace(5) %"40", align 4
|
||||
%"55" = inttoptr i64 %"47" to ptr
|
||||
%"46" = load i64, ptr %"55", align 4
|
||||
store i64 %"46", ptr addrspace(5) %"42", align 4
|
||||
%"49" = load i64, ptr addrspace(5) %"42", align 4
|
||||
%"48" = add i64 %"49", 1
|
||||
store i64 %"48", ptr addrspace(5) %"43", align 4
|
||||
%"52" = load i64, ptr addrspace(5) %"50", align 4
|
||||
%"51" = add i64 %"52", 1
|
||||
store i64 %"51", ptr addrspace(5) %"50", align 4
|
||||
%"53" = load i64, ptr addrspace(5) %"41", align 4
|
||||
%"54" = load i64, ptr addrspace(5) %"43", align 4
|
||||
%"56" = inttoptr i64 %"53" to ptr
|
||||
store i64 %"54", ptr %"56", align 4
|
||||
br label %"33"
|
||||
|
||||
"33": ; preds = %1
|
||||
%"40" = load i64, ptr addrspace(4) %"34", align 4
|
||||
store i64 %"40", ptr addrspace(5) %"36", align 4
|
||||
%"41" = load i64, ptr addrspace(4) %"35", align 4
|
||||
store i64 %"41", ptr addrspace(5) %"37", align 4
|
||||
%"43" = load i64, ptr addrspace(5) %"36", align 4
|
||||
%"51" = inttoptr i64 %"43" to ptr
|
||||
%"42" = load i64, ptr %"51", align 4
|
||||
store i64 %"42", ptr addrspace(5) %"38", align 4
|
||||
%"45" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"44" = add i64 %"45", 1
|
||||
store i64 %"44", ptr addrspace(5) %"39", align 4
|
||||
%"48" = load i64, ptr addrspace(5) %"46", align 4
|
||||
%"47" = add i64 %"48", 1
|
||||
store i64 %"47", ptr addrspace(5) %"46", align 4
|
||||
%"49" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"50" = load i64, ptr addrspace(5) %"39", align 4
|
||||
%"52" = inttoptr i64 %"49" to ptr
|
||||
store i64 %"50", ptr %"52", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -1,51 +1,36 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @bra(ptr addrspace(4) byref(i64) %"40", ptr addrspace(4) byref(i64) %"41") #0 {
|
||||
%"42" = alloca i64, align 8, addrspace(5)
|
||||
%"43" = alloca i64, align 8, addrspace(5)
|
||||
%"44" = alloca i64, align 8, addrspace(5)
|
||||
%"45" = alloca i64, align 8, addrspace(5)
|
||||
define amdgpu_kernel void @bra(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 {
|
||||
%"38" = alloca i64, align 8, addrspace(5)
|
||||
%"39" = alloca i64, align 8, addrspace(5)
|
||||
%"40" = alloca i64, align 8, addrspace(5)
|
||||
%"41" = alloca i64, align 8, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"46" = load i64, ptr addrspace(4) %"40", align 4
|
||||
store i64 %"46", ptr addrspace(5) %"42", align 4
|
||||
%"47" = load i64, ptr addrspace(4) %"41", align 4
|
||||
store i64 %"47", ptr addrspace(5) %"43", align 4
|
||||
%"49" = load i64, ptr addrspace(5) %"42", align 4
|
||||
%"56" = inttoptr i64 %"49" to ptr
|
||||
%"48" = load i64, ptr %"56", align 4
|
||||
store i64 %"48", ptr addrspace(5) %"44", align 4
|
||||
br label %"9"
|
||||
br label %"35"
|
||||
|
||||
"9": ; preds = %1
|
||||
%"51" = load i64, ptr addrspace(5) %"44", align 4
|
||||
%"50" = add i64 %"51", 1
|
||||
store i64 %"50", ptr addrspace(5) %"45", align 4
|
||||
br label %"11"
|
||||
"35": ; preds = %1
|
||||
%"42" = load i64, ptr addrspace(4) %"36", align 4
|
||||
store i64 %"42", ptr addrspace(5) %"38", align 4
|
||||
%"43" = load i64, ptr addrspace(4) %"37", align 4
|
||||
store i64 %"43", ptr addrspace(5) %"39", align 4
|
||||
%"45" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"50" = inttoptr i64 %"45" to ptr
|
||||
%"44" = load i64, ptr %"50", align 4
|
||||
store i64 %"44", ptr addrspace(5) %"40", align 4
|
||||
br label %"10"
|
||||
|
||||
"10": ; No predecessors!
|
||||
%"53" = load i64, ptr addrspace(5) %"44", align 4
|
||||
%"52" = add i64 %"53", 2
|
||||
store i64 %"52", ptr addrspace(5) %"45", align 4
|
||||
br label %"11"
|
||||
"10": ; preds = %"35"
|
||||
%"47" = load i64, ptr addrspace(5) %"40", align 4
|
||||
%"46" = add i64 %"47", 1
|
||||
store i64 %"46", ptr addrspace(5) %"41", align 4
|
||||
br label %"12"
|
||||
|
||||
"11": ; preds = %"10", %"9"
|
||||
%"54" = load i64, ptr addrspace(5) %"43", align 4
|
||||
%"55" = load i64, ptr addrspace(5) %"45", align 4
|
||||
%"57" = inttoptr i64 %"54" to ptr
|
||||
store i64 %"55", ptr %"57", align 4
|
||||
"12": ; preds = %"10"
|
||||
%"48" = load i64, ptr addrspace(5) %"39", align 4
|
||||
%"49" = load i64, ptr addrspace(5) %"41", align 4
|
||||
%"51" = inttoptr i64 %"48" to ptr
|
||||
store i64 %"49", ptr %"51", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -1,42 +1,33 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @brev(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 {
|
||||
%"36" = alloca i64, align 8, addrspace(5)
|
||||
%"37" = alloca i64, align 8, addrspace(5)
|
||||
%"38" = alloca i32, align 4, addrspace(5)
|
||||
define amdgpu_kernel void @brev(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
|
||||
%"32" = alloca i64, align 8, addrspace(5)
|
||||
%"33" = alloca i64, align 8, addrspace(5)
|
||||
%"34" = alloca i32, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"39" = load i64, ptr addrspace(4) %"34", align 4
|
||||
store i64 %"39", ptr addrspace(5) %"36", align 4
|
||||
%"40" = load i64, ptr addrspace(4) %"35", align 4
|
||||
store i64 %"40", ptr addrspace(5) %"37", align 4
|
||||
%"42" = load i64, ptr addrspace(5) %"36", align 4
|
||||
%"47" = inttoptr i64 %"42" to ptr
|
||||
%"41" = load i32, ptr %"47", align 4
|
||||
store i32 %"41", ptr addrspace(5) %"38", align 4
|
||||
%"44" = load i32, ptr addrspace(5) %"38", align 4
|
||||
%"43" = call i32 @llvm.bitreverse.i32(i32 %"44")
|
||||
store i32 %"43", ptr addrspace(5) %"38", align 4
|
||||
%"45" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"46" = load i32, ptr addrspace(5) %"38", align 4
|
||||
%"48" = inttoptr i64 %"45" to ptr
|
||||
store i32 %"46", ptr %"48", align 4
|
||||
br label %"29"
|
||||
|
||||
"29": ; preds = %1
|
||||
%"35" = load i64, ptr addrspace(4) %"30", align 4
|
||||
store i64 %"35", ptr addrspace(5) %"32", align 4
|
||||
%"36" = load i64, ptr addrspace(4) %"31", align 4
|
||||
store i64 %"36", ptr addrspace(5) %"33", align 4
|
||||
%"38" = load i64, ptr addrspace(5) %"32", align 4
|
||||
%"43" = inttoptr i64 %"38" to ptr
|
||||
%"37" = load i32, ptr %"43", align 4
|
||||
store i32 %"37", ptr addrspace(5) %"34", align 4
|
||||
%"40" = load i32, ptr addrspace(5) %"34", align 4
|
||||
%"39" = call i32 @llvm.bitreverse.i32(i32 %"40")
|
||||
store i32 %"39", ptr addrspace(5) %"34", align 4
|
||||
%"41" = load i64, ptr addrspace(5) %"33", align 4
|
||||
%"42" = load i32, ptr addrspace(5) %"34", align 4
|
||||
%"44" = inttoptr i64 %"41" to ptr
|
||||
store i32 %"42", ptr %"44", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
||||
declare i32 @llvm.bitreverse.i32(i32) #1
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
@ -1,66 +1,64 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define i64 @__zluda_ptx_impl_incr(i64 %"42") #0 {
|
||||
define i64 @incr(i64 %"43") #0 {
|
||||
%"63" = alloca i64, align 8, addrspace(5)
|
||||
%"64" = alloca i64, align 8, addrspace(5)
|
||||
%"65" = alloca i64, align 8, addrspace(5)
|
||||
%"66" = alloca i64, align 8, addrspace(5)
|
||||
%"67" = alloca i64, align 8, addrspace(5)
|
||||
%"68" = alloca i64, align 8, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
store i64 %"42", ptr addrspace(5) %"67", align 4
|
||||
%"69" = load i64, ptr addrspace(5) %"67", align 4
|
||||
store i64 %"69", ptr addrspace(5) %"68", align 4
|
||||
%"71" = load i64, ptr addrspace(5) %"68", align 4
|
||||
%"70" = add i64 %"71", 1
|
||||
store i64 %"70", ptr addrspace(5) %"68", align 4
|
||||
%"72" = load i64, ptr addrspace(5) %"68", align 4
|
||||
store i64 %"72", ptr addrspace(5) %"66", align 4
|
||||
%"73" = load i64, ptr addrspace(5) %"66", align 4
|
||||
store i64 %"73", ptr addrspace(5) %"65", align 4
|
||||
%2 = load i64, ptr addrspace(5) %"65", align 4
|
||||
br label %"46"
|
||||
|
||||
"46": ; preds = %1
|
||||
store i64 %"43", ptr addrspace(5) %"65", align 4
|
||||
%"67" = load i64, ptr addrspace(5) %"65", align 4
|
||||
store i64 %"67", ptr addrspace(5) %"66", align 4
|
||||
%"69" = load i64, ptr addrspace(5) %"66", align 4
|
||||
%"68" = add i64 %"69", 1
|
||||
store i64 %"68", ptr addrspace(5) %"66", align 4
|
||||
%"70" = load i64, ptr addrspace(5) %"66", align 4
|
||||
store i64 %"70", ptr addrspace(5) %"64", align 4
|
||||
%"71" = load i64, ptr addrspace(5) %"64", align 4
|
||||
store i64 %"71", ptr addrspace(5) %"63", align 4
|
||||
%2 = load i64, ptr addrspace(5) %"63", align 4
|
||||
ret i64 %2
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @call(ptr addrspace(4) byref(i64) %"50", ptr addrspace(4) byref(i64) %"51") #0 {
|
||||
define amdgpu_kernel void @call(ptr addrspace(4) byref(i64) %"48", ptr addrspace(4) byref(i64) %"49") #1 {
|
||||
%"50" = alloca i64, align 8, addrspace(5)
|
||||
%"51" = alloca i64, align 8, addrspace(5)
|
||||
%"52" = alloca i64, align 8, addrspace(5)
|
||||
%"53" = alloca i64, align 8, addrspace(5)
|
||||
%"54" = alloca i64, align 8, addrspace(5)
|
||||
%"59" = alloca i64, align 8, addrspace(5)
|
||||
%"60" = alloca i64, align 8, addrspace(5)
|
||||
%"57" = alloca i64, align 8, addrspace(5)
|
||||
%"58" = alloca i64, align 8, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"55" = load i64, ptr addrspace(4) %"50", align 4
|
||||
br label %"44"
|
||||
|
||||
"44": ; preds = %1
|
||||
%"53" = load i64, ptr addrspace(4) %"48", align 4
|
||||
store i64 %"53", ptr addrspace(5) %"50", align 4
|
||||
%"54" = load i64, ptr addrspace(4) %"49", align 4
|
||||
store i64 %"54", ptr addrspace(5) %"51", align 4
|
||||
%"56" = load i64, ptr addrspace(5) %"50", align 4
|
||||
%"72" = inttoptr i64 %"56" to ptr addrspace(1)
|
||||
%"55" = load i64, ptr addrspace(1) %"72", align 4
|
||||
store i64 %"55", ptr addrspace(5) %"52", align 4
|
||||
%"56" = load i64, ptr addrspace(4) %"51", align 4
|
||||
store i64 %"56", ptr addrspace(5) %"53", align 4
|
||||
%"58" = load i64, ptr addrspace(5) %"52", align 4
|
||||
%"74" = inttoptr i64 %"58" to ptr addrspace(1)
|
||||
%"57" = load i64, ptr addrspace(1) %"74", align 4
|
||||
store i64 %"57", ptr addrspace(5) %"54", align 4
|
||||
%"61" = load i64, ptr addrspace(5) %"54", align 4
|
||||
store i64 %"61", ptr addrspace(5) %"59", align 4
|
||||
%"39" = load i64, ptr addrspace(5) %"59", align 4
|
||||
%"40" = call i64 @__zluda_ptx_impl_incr(i64 %"39")
|
||||
store i64 %"40", ptr addrspace(5) %"60", align 4
|
||||
%"62" = load i64, ptr addrspace(5) %"60", align 4
|
||||
store i64 %"62", ptr addrspace(5) %"54", align 4
|
||||
%"63" = load i64, ptr addrspace(5) %"53", align 4
|
||||
%"64" = load i64, ptr addrspace(5) %"54", align 4
|
||||
%"77" = inttoptr i64 %"63" to ptr addrspace(1)
|
||||
store i64 %"64", ptr addrspace(1) %"77", align 4
|
||||
%"59" = load i64, ptr addrspace(5) %"52", align 4
|
||||
store i64 %"59", ptr addrspace(5) %"57", align 4
|
||||
%"40" = load i64, ptr addrspace(5) %"57", align 4
|
||||
%"41" = call i64 @incr(i64 %"40")
|
||||
br label %"45"
|
||||
|
||||
"45": ; preds = %"44"
|
||||
store i64 %"41", ptr addrspace(5) %"58", align 4
|
||||
%"60" = load i64, ptr addrspace(5) %"58", align 4
|
||||
store i64 %"60", ptr addrspace(5) %"52", align 4
|
||||
%"61" = load i64, ptr addrspace(5) %"51", align 4
|
||||
%"62" = load i64, ptr addrspace(5) %"52", align 4
|
||||
%"75" = inttoptr i64 %"61" to ptr addrspace(1)
|
||||
store i64 %"62", ptr addrspace(1) %"75", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
155
ptx/src/test/ll/call_rnd.ll
Normal file
155
ptx/src/test/ll/call_rnd.ll
Normal file
@ -0,0 +1,155 @@
|
||||
define float @add_rm(float %"79", float %"80") #0 {
|
||||
%"128" = alloca float, align 4, addrspace(5)
|
||||
%"129" = alloca float, align 4, addrspace(5)
|
||||
%"130" = alloca float, align 4, addrspace(5)
|
||||
%"131" = alloca float, align 4, addrspace(5)
|
||||
%"132" = alloca float, align 4, addrspace(5)
|
||||
%"133" = alloca float, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
br label %"89"
|
||||
|
||||
"89": ; preds = %1
|
||||
call void @llvm.amdgcn.s.setreg(i32 6145, i32 2)
|
||||
br label %"87"
|
||||
|
||||
"87": ; preds = %"89"
|
||||
store float %"79", ptr addrspace(5) %"130", align 4
|
||||
store float %"80", ptr addrspace(5) %"131", align 4
|
||||
%"134" = load float, ptr addrspace(5) %"130", align 4
|
||||
store float %"134", ptr addrspace(5) %"132", align 4
|
||||
%"135" = load float, ptr addrspace(5) %"131", align 4
|
||||
store float %"135", ptr addrspace(5) %"133", align 4
|
||||
%"137" = load float, ptr addrspace(5) %"132", align 4
|
||||
%"138" = load float, ptr addrspace(5) %"133", align 4
|
||||
%"136" = fadd float %"137", %"138"
|
||||
store float %"136", ptr addrspace(5) %"132", align 4
|
||||
%"139" = load float, ptr addrspace(5) %"132", align 4
|
||||
store float %"139", ptr addrspace(5) %"129", align 4
|
||||
%"140" = load float, ptr addrspace(5) %"129", align 4
|
||||
store float %"140", ptr addrspace(5) %"128", align 4
|
||||
%2 = load float, ptr addrspace(5) %"128", align 4
|
||||
ret float %2
|
||||
}
|
||||
|
||||
define float @add_rp(float %"82", float %"83") #0 {
|
||||
%"141" = alloca float, align 4, addrspace(5)
|
||||
%"142" = alloca float, align 4, addrspace(5)
|
||||
%"143" = alloca float, align 4, addrspace(5)
|
||||
%"144" = alloca float, align 4, addrspace(5)
|
||||
%"145" = alloca float, align 4, addrspace(5)
|
||||
%"146" = alloca float, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
br label %"88"
|
||||
|
||||
"88": ; preds = %1
|
||||
store float %"82", ptr addrspace(5) %"143", align 4
|
||||
store float %"83", ptr addrspace(5) %"144", align 4
|
||||
%"147" = load float, ptr addrspace(5) %"143", align 4
|
||||
store float %"147", ptr addrspace(5) %"145", align 4
|
||||
%"148" = load float, ptr addrspace(5) %"144", align 4
|
||||
store float %"148", ptr addrspace(5) %"146", align 4
|
||||
%"150" = load float, ptr addrspace(5) %"145", align 4
|
||||
%"151" = load float, ptr addrspace(5) %"146", align 4
|
||||
%"149" = fadd float %"150", %"151"
|
||||
store float %"149", ptr addrspace(5) %"145", align 4
|
||||
%"152" = load float, ptr addrspace(5) %"145", align 4
|
||||
store float %"152", ptr addrspace(5) %"142", align 4
|
||||
%"153" = load float, ptr addrspace(5) %"142", align 4
|
||||
store float %"153", ptr addrspace(5) %"141", align 4
|
||||
%2 = load float, ptr addrspace(5) %"141", align 4
|
||||
ret float %2
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @call_rnd(ptr addrspace(4) byref(i64) %"92", ptr addrspace(4) byref(i64) %"93") #1 {
|
||||
%"94" = alloca i64, align 8, addrspace(5)
|
||||
%"95" = alloca i64, align 8, addrspace(5)
|
||||
%"96" = alloca float, align 4, addrspace(5)
|
||||
%"97" = alloca float, align 4, addrspace(5)
|
||||
%"98" = alloca float, align 4, addrspace(5)
|
||||
%"99" = alloca float, align 4, addrspace(5)
|
||||
%"100" = alloca float, align 4, addrspace(5)
|
||||
%"101" = alloca float, align 4, addrspace(5)
|
||||
%"102" = alloca float, align 4, addrspace(5)
|
||||
%"103" = alloca float, align 4, addrspace(5)
|
||||
%"104" = alloca float, align 4, addrspace(5)
|
||||
%"105" = alloca float, align 4, addrspace(5)
|
||||
%"106" = alloca float, align 4, addrspace(5)
|
||||
%"107" = alloca float, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
br label %"84"
|
||||
|
||||
"84": ; preds = %1
|
||||
call void @llvm.amdgcn.s.setreg(i32 6145, i32 1)
|
||||
%"108" = load i64, ptr addrspace(4) %"92", align 4
|
||||
store i64 %"108", ptr addrspace(5) %"94", align 4
|
||||
%"109" = load i64, ptr addrspace(4) %"93", align 4
|
||||
store i64 %"109", ptr addrspace(5) %"95", align 4
|
||||
%"111" = load i64, ptr addrspace(5) %"94", align 4
|
||||
%"154" = inttoptr i64 %"111" to ptr
|
||||
%"110" = load float, ptr %"154", align 4
|
||||
store float %"110", ptr addrspace(5) %"96", align 4
|
||||
%"112" = load i64, ptr addrspace(5) %"94", align 4
|
||||
%"155" = inttoptr i64 %"112" to ptr
|
||||
%"59" = getelementptr inbounds i8, ptr %"155", i64 4
|
||||
%"113" = load float, ptr %"59", align 4
|
||||
store float %"113", ptr addrspace(5) %"97", align 4
|
||||
%"114" = load i64, ptr addrspace(5) %"94", align 4
|
||||
%"156" = inttoptr i64 %"114" to ptr
|
||||
%"61" = getelementptr inbounds i8, ptr %"156", i64 8
|
||||
%"115" = load float, ptr %"61", align 4
|
||||
store float %"115", ptr addrspace(5) %"98", align 4
|
||||
%"116" = load i64, ptr addrspace(5) %"94", align 4
|
||||
%"157" = inttoptr i64 %"116" to ptr
|
||||
%"63" = getelementptr inbounds i8, ptr %"157", i64 12
|
||||
%"117" = load float, ptr %"63", align 4
|
||||
store float %"117", ptr addrspace(5) %"99", align 4
|
||||
%"118" = load float, ptr addrspace(5) %"96", align 4
|
||||
store float %"118", ptr addrspace(5) %"102", align 4
|
||||
%"119" = load float, ptr addrspace(5) %"97", align 4
|
||||
store float %"119", ptr addrspace(5) %"103", align 4
|
||||
%"72" = load float, ptr addrspace(5) %"102", align 4
|
||||
%"73" = load float, ptr addrspace(5) %"103", align 4
|
||||
%"74" = call float @add_rp(float %"72", float %"73")
|
||||
br label %"85"
|
||||
|
||||
"85": ; preds = %"84"
|
||||
store float %"74", ptr addrspace(5) %"104", align 4
|
||||
%"120" = load float, ptr addrspace(5) %"104", align 4
|
||||
store float %"120", ptr addrspace(5) %"100", align 4
|
||||
%"121" = load i64, ptr addrspace(5) %"95", align 4
|
||||
%"122" = load float, ptr addrspace(5) %"100", align 4
|
||||
%"158" = inttoptr i64 %"121" to ptr
|
||||
store float %"122", ptr %"158", align 4
|
||||
%"123" = load float, ptr addrspace(5) %"98", align 4
|
||||
store float %"123", ptr addrspace(5) %"105", align 4
|
||||
%"124" = load float, ptr addrspace(5) %"99", align 4
|
||||
store float %"124", ptr addrspace(5) %"106", align 4
|
||||
%"75" = load float, ptr addrspace(5) %"105", align 4
|
||||
%"76" = load float, ptr addrspace(5) %"106", align 4
|
||||
%"77" = call float @add_rm(float %"75", float %"76")
|
||||
br label %"86"
|
||||
|
||||
"86": ; preds = %"85"
|
||||
store float %"77", ptr addrspace(5) %"107", align 4
|
||||
%"125" = load float, ptr addrspace(5) %"107", align 4
|
||||
store float %"125", ptr addrspace(5) %"101", align 4
|
||||
%"126" = load i64, ptr addrspace(5) %"95", align 4
|
||||
%"159" = inttoptr i64 %"126" to ptr
|
||||
%"65" = getelementptr inbounds i8, ptr %"159", i64 4
|
||||
%"127" = load float, ptr addrspace(5) %"101", align 4
|
||||
store float %"127", ptr %"65", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: nocallback nofree nosync nounwind willreturn
|
||||
declare void @llvm.amdgcn.s.setreg(i32 immarg, i32) #2
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #2 = { nocallback nofree nosync nounwind willreturn }
|
@ -1,42 +1,33 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @clz(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 {
|
||||
%"36" = alloca i64, align 8, addrspace(5)
|
||||
%"37" = alloca i64, align 8, addrspace(5)
|
||||
%"38" = alloca i32, align 4, addrspace(5)
|
||||
define amdgpu_kernel void @clz(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
|
||||
%"32" = alloca i64, align 8, addrspace(5)
|
||||
%"33" = alloca i64, align 8, addrspace(5)
|
||||
%"34" = alloca i32, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"39" = load i64, ptr addrspace(4) %"34", align 4
|
||||
store i64 %"39", ptr addrspace(5) %"36", align 4
|
||||
%"40" = load i64, ptr addrspace(4) %"35", align 4
|
||||
store i64 %"40", ptr addrspace(5) %"37", align 4
|
||||
%"42" = load i64, ptr addrspace(5) %"36", align 4
|
||||
%"47" = inttoptr i64 %"42" to ptr
|
||||
%"41" = load i32, ptr %"47", align 4
|
||||
store i32 %"41", ptr addrspace(5) %"38", align 4
|
||||
%"44" = load i32, ptr addrspace(5) %"38", align 4
|
||||
%"48" = call i32 @llvm.ctlz.i32(i32 %"44", i1 false)
|
||||
store i32 %"48", ptr addrspace(5) %"38", align 4
|
||||
%"45" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"46" = load i32, ptr addrspace(5) %"38", align 4
|
||||
%"49" = inttoptr i64 %"45" to ptr
|
||||
store i32 %"46", ptr %"49", align 4
|
||||
br label %"29"
|
||||
|
||||
"29": ; preds = %1
|
||||
%"35" = load i64, ptr addrspace(4) %"30", align 4
|
||||
store i64 %"35", ptr addrspace(5) %"32", align 4
|
||||
%"36" = load i64, ptr addrspace(4) %"31", align 4
|
||||
store i64 %"36", ptr addrspace(5) %"33", align 4
|
||||
%"38" = load i64, ptr addrspace(5) %"32", align 4
|
||||
%"43" = inttoptr i64 %"38" to ptr
|
||||
%"37" = load i32, ptr %"43", align 4
|
||||
store i32 %"37", ptr addrspace(5) %"34", align 4
|
||||
%"40" = load i32, ptr addrspace(5) %"34", align 4
|
||||
%"44" = call i32 @llvm.ctlz.i32(i32 %"40", i1 false)
|
||||
store i32 %"44", ptr addrspace(5) %"34", align 4
|
||||
%"41" = load i64, ptr addrspace(5) %"33", align 4
|
||||
%"42" = load i32, ptr addrspace(5) %"34", align 4
|
||||
%"45" = inttoptr i64 %"41" to ptr
|
||||
store i32 %"42", ptr %"45", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
||||
declare i32 @llvm.ctlz.i32(i32, i1 immarg) #1
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
@ -1,59 +1,50 @@
|
||||
@constparams = addrspace(4) global [4 x i16] [i16 10, i16 20, i16 30, i16 40], align 8
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @const(ptr addrspace(4) byref(i64) %"50", ptr addrspace(4) byref(i64) %"51") #0 {
|
||||
%"52" = alloca i64, align 8, addrspace(5)
|
||||
%"53" = alloca i64, align 8, addrspace(5)
|
||||
%"54" = alloca i16, align 2, addrspace(5)
|
||||
%"55" = alloca i16, align 2, addrspace(5)
|
||||
%"56" = alloca i16, align 2, addrspace(5)
|
||||
%"57" = alloca i16, align 2, addrspace(5)
|
||||
define amdgpu_kernel void @const(ptr addrspace(4) byref(i64) %"46", ptr addrspace(4) byref(i64) %"47") #0 {
|
||||
%"48" = alloca i64, align 8, addrspace(5)
|
||||
%"49" = alloca i64, align 8, addrspace(5)
|
||||
%"50" = alloca i16, align 2, addrspace(5)
|
||||
%"51" = alloca i16, align 2, addrspace(5)
|
||||
%"52" = alloca i16, align 2, addrspace(5)
|
||||
%"53" = alloca i16, align 2, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"58" = load i64, ptr addrspace(4) %"50", align 4
|
||||
store i64 %"58", ptr addrspace(5) %"52", align 4
|
||||
%"59" = load i64, ptr addrspace(4) %"51", align 4
|
||||
store i64 %"59", ptr addrspace(5) %"53", align 4
|
||||
%"60" = load i16, ptr addrspace(4) @constparams, align 2
|
||||
store i16 %"60", ptr addrspace(5) %"54", align 2
|
||||
%"61" = load i16, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) @constparams, i64 2), align 2
|
||||
store i16 %"61", ptr addrspace(5) %"55", align 2
|
||||
%"62" = load i16, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) @constparams, i64 4), align 2
|
||||
store i16 %"62", ptr addrspace(5) %"56", align 2
|
||||
%"63" = load i16, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) @constparams, i64 6), align 2
|
||||
store i16 %"63", ptr addrspace(5) %"57", align 2
|
||||
%"64" = load i64, ptr addrspace(5) %"53", align 4
|
||||
%"65" = load i16, ptr addrspace(5) %"54", align 2
|
||||
br label %"45"
|
||||
|
||||
"45": ; preds = %1
|
||||
%"54" = load i64, ptr addrspace(4) %"46", align 4
|
||||
store i64 %"54", ptr addrspace(5) %"48", align 4
|
||||
%"55" = load i64, ptr addrspace(4) %"47", align 4
|
||||
store i64 %"55", ptr addrspace(5) %"49", align 4
|
||||
%"56" = load i16, ptr addrspace(4) @constparams, align 2
|
||||
store i16 %"56", ptr addrspace(5) %"50", align 2
|
||||
%"57" = load i16, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) @constparams, i64 2), align 2
|
||||
store i16 %"57", ptr addrspace(5) %"51", align 2
|
||||
%"58" = load i16, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) @constparams, i64 4), align 2
|
||||
store i16 %"58", ptr addrspace(5) %"52", align 2
|
||||
%"59" = load i16, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) @constparams, i64 6), align 2
|
||||
store i16 %"59", ptr addrspace(5) %"53", align 2
|
||||
%"60" = load i64, ptr addrspace(5) %"49", align 4
|
||||
%"61" = load i16, ptr addrspace(5) %"50", align 2
|
||||
%"72" = inttoptr i64 %"60" to ptr
|
||||
store i16 %"61", ptr %"72", align 2
|
||||
%"62" = load i64, ptr addrspace(5) %"49", align 4
|
||||
%"74" = inttoptr i64 %"62" to ptr
|
||||
%"40" = getelementptr inbounds i8, ptr %"74", i64 2
|
||||
%"63" = load i16, ptr addrspace(5) %"51", align 2
|
||||
store i16 %"63", ptr %"40", align 2
|
||||
%"64" = load i64, ptr addrspace(5) %"49", align 4
|
||||
%"76" = inttoptr i64 %"64" to ptr
|
||||
store i16 %"65", ptr %"76", align 2
|
||||
%"66" = load i64, ptr addrspace(5) %"53", align 4
|
||||
%"42" = getelementptr inbounds i8, ptr %"76", i64 4
|
||||
%"65" = load i16, ptr addrspace(5) %"52", align 2
|
||||
store i16 %"65", ptr %"42", align 2
|
||||
%"66" = load i64, ptr addrspace(5) %"49", align 4
|
||||
%"78" = inttoptr i64 %"66" to ptr
|
||||
%"39" = getelementptr inbounds i8, ptr %"78", i64 2
|
||||
%"67" = load i16, ptr addrspace(5) %"55", align 2
|
||||
store i16 %"67", ptr %"39", align 2
|
||||
%"68" = load i64, ptr addrspace(5) %"53", align 4
|
||||
%"80" = inttoptr i64 %"68" to ptr
|
||||
%"41" = getelementptr inbounds i8, ptr %"80", i64 4
|
||||
%"69" = load i16, ptr addrspace(5) %"56", align 2
|
||||
store i16 %"69", ptr %"41", align 2
|
||||
%"70" = load i64, ptr addrspace(5) %"53", align 4
|
||||
%"82" = inttoptr i64 %"70" to ptr
|
||||
%"43" = getelementptr inbounds i8, ptr %"82", i64 6
|
||||
%"71" = load i16, ptr addrspace(5) %"57", align 2
|
||||
store i16 %"71", ptr %"43", align 2
|
||||
%"44" = getelementptr inbounds i8, ptr %"78", i64 6
|
||||
%"67" = load i16, ptr addrspace(5) %"53", align 2
|
||||
store i16 %"67", ptr %"44", align 2
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -1,38 +1,29 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @constant_f32(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 {
|
||||
%"37" = alloca i64, align 8, addrspace(5)
|
||||
%"38" = alloca i64, align 8, addrspace(5)
|
||||
%"39" = alloca float, align 4, addrspace(5)
|
||||
define amdgpu_kernel void @constant_f32(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
|
||||
%"33" = alloca i64, align 8, addrspace(5)
|
||||
%"34" = alloca i64, align 8, addrspace(5)
|
||||
%"35" = alloca float, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"40" = load i64, ptr addrspace(4) %"35", align 4
|
||||
store i64 %"40", ptr addrspace(5) %"37", align 4
|
||||
%"41" = load i64, ptr addrspace(4) %"36", align 4
|
||||
store i64 %"41", ptr addrspace(5) %"38", align 4
|
||||
%"43" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"48" = inttoptr i64 %"43" to ptr
|
||||
%"42" = load float, ptr %"48", align 4
|
||||
store float %"42", ptr addrspace(5) %"39", align 4
|
||||
%"45" = load float, ptr addrspace(5) %"39", align 4
|
||||
%"44" = fmul float %"45", 5.000000e-01
|
||||
store float %"44", ptr addrspace(5) %"39", align 4
|
||||
%"46" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"47" = load float, ptr addrspace(5) %"39", align 4
|
||||
%"49" = inttoptr i64 %"46" to ptr
|
||||
store float %"47", ptr %"49", align 4
|
||||
br label %"30"
|
||||
|
||||
"30": ; preds = %1
|
||||
%"36" = load i64, ptr addrspace(4) %"31", align 4
|
||||
store i64 %"36", ptr addrspace(5) %"33", align 4
|
||||
%"37" = load i64, ptr addrspace(4) %"32", align 4
|
||||
store i64 %"37", ptr addrspace(5) %"34", align 4
|
||||
%"39" = load i64, ptr addrspace(5) %"33", align 4
|
||||
%"44" = inttoptr i64 %"39" to ptr
|
||||
%"38" = load float, ptr %"44", align 4
|
||||
store float %"38", ptr addrspace(5) %"35", align 4
|
||||
%"41" = load float, ptr addrspace(5) %"35", align 4
|
||||
%"40" = fmul float %"41", 5.000000e-01
|
||||
store float %"40", ptr addrspace(5) %"35", align 4
|
||||
%"42" = load i64, ptr addrspace(5) %"34", align 4
|
||||
%"43" = load float, ptr addrspace(5) %"35", align 4
|
||||
%"45" = inttoptr i64 %"42" to ptr
|
||||
store float %"43", ptr %"45", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -1,38 +1,29 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @constant_negative(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 {
|
||||
%"37" = alloca i64, align 8, addrspace(5)
|
||||
%"38" = alloca i64, align 8, addrspace(5)
|
||||
%"39" = alloca i32, align 4, addrspace(5)
|
||||
define amdgpu_kernel void @constant_negative(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
|
||||
%"33" = alloca i64, align 8, addrspace(5)
|
||||
%"34" = alloca i64, align 8, addrspace(5)
|
||||
%"35" = alloca i32, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"40" = load i64, ptr addrspace(4) %"35", align 4
|
||||
store i64 %"40", ptr addrspace(5) %"37", align 4
|
||||
%"41" = load i64, ptr addrspace(4) %"36", align 4
|
||||
store i64 %"41", ptr addrspace(5) %"38", align 4
|
||||
%"43" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"48" = inttoptr i64 %"43" to ptr
|
||||
%"42" = load i32, ptr %"48", align 4
|
||||
store i32 %"42", ptr addrspace(5) %"39", align 4
|
||||
%"45" = load i32, ptr addrspace(5) %"39", align 4
|
||||
%"44" = mul i32 %"45", -1
|
||||
store i32 %"44", ptr addrspace(5) %"39", align 4
|
||||
%"46" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"47" = load i32, ptr addrspace(5) %"39", align 4
|
||||
%"49" = inttoptr i64 %"46" to ptr
|
||||
store i32 %"47", ptr %"49", align 4
|
||||
br label %"30"
|
||||
|
||||
"30": ; preds = %1
|
||||
%"36" = load i64, ptr addrspace(4) %"31", align 4
|
||||
store i64 %"36", ptr addrspace(5) %"33", align 4
|
||||
%"37" = load i64, ptr addrspace(4) %"32", align 4
|
||||
store i64 %"37", ptr addrspace(5) %"34", align 4
|
||||
%"39" = load i64, ptr addrspace(5) %"33", align 4
|
||||
%"44" = inttoptr i64 %"39" to ptr
|
||||
%"38" = load i32, ptr %"44", align 4
|
||||
store i32 %"38", ptr addrspace(5) %"35", align 4
|
||||
%"41" = load i32, ptr addrspace(5) %"35", align 4
|
||||
%"40" = mul i32 %"41", -1
|
||||
store i32 %"40", ptr addrspace(5) %"35", align 4
|
||||
%"42" = load i64, ptr addrspace(5) %"34", align 4
|
||||
%"43" = load i32, ptr addrspace(5) %"35", align 4
|
||||
%"45" = inttoptr i64 %"42" to ptr
|
||||
store i32 %"43", ptr %"45", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -1,42 +1,33 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @cos(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 {
|
||||
%"36" = alloca i64, align 8, addrspace(5)
|
||||
%"37" = alloca i64, align 8, addrspace(5)
|
||||
%"38" = alloca float, align 4, addrspace(5)
|
||||
define amdgpu_kernel void @cos(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
|
||||
%"32" = alloca i64, align 8, addrspace(5)
|
||||
%"33" = alloca i64, align 8, addrspace(5)
|
||||
%"34" = alloca float, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"39" = load i64, ptr addrspace(4) %"34", align 4
|
||||
store i64 %"39", ptr addrspace(5) %"36", align 4
|
||||
%"40" = load i64, ptr addrspace(4) %"35", align 4
|
||||
store i64 %"40", ptr addrspace(5) %"37", align 4
|
||||
%"42" = load i64, ptr addrspace(5) %"36", align 4
|
||||
%"47" = inttoptr i64 %"42" to ptr
|
||||
%"41" = load float, ptr %"47", align 4
|
||||
store float %"41", ptr addrspace(5) %"38", align 4
|
||||
%"44" = load float, ptr addrspace(5) %"38", align 4
|
||||
%"43" = call afn float @llvm.cos.f32(float %"44")
|
||||
store float %"43", ptr addrspace(5) %"38", align 4
|
||||
%"45" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"46" = load float, ptr addrspace(5) %"38", align 4
|
||||
%"48" = inttoptr i64 %"45" to ptr
|
||||
store float %"46", ptr %"48", align 4
|
||||
br label %"29"
|
||||
|
||||
"29": ; preds = %1
|
||||
%"35" = load i64, ptr addrspace(4) %"30", align 4
|
||||
store i64 %"35", ptr addrspace(5) %"32", align 4
|
||||
%"36" = load i64, ptr addrspace(4) %"31", align 4
|
||||
store i64 %"36", ptr addrspace(5) %"33", align 4
|
||||
%"38" = load i64, ptr addrspace(5) %"32", align 4
|
||||
%"43" = inttoptr i64 %"38" to ptr
|
||||
%"37" = load float, ptr %"43", align 4
|
||||
store float %"37", ptr addrspace(5) %"34", align 4
|
||||
%"40" = load float, ptr addrspace(5) %"34", align 4
|
||||
%"39" = call afn float @llvm.cos.f32(float %"40")
|
||||
store float %"39", ptr addrspace(5) %"34", align 4
|
||||
%"41" = load i64, ptr addrspace(5) %"33", align 4
|
||||
%"42" = load float, ptr addrspace(5) %"34", align 4
|
||||
%"44" = inttoptr i64 %"41" to ptr
|
||||
store float %"42", ptr %"44", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
||||
declare float @llvm.cos.f32(float) #1
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
@ -1,39 +1,30 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @cvt_f64_f32(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 {
|
||||
%"37" = alloca i64, align 8, addrspace(5)
|
||||
%"38" = alloca i64, align 8, addrspace(5)
|
||||
%"39" = alloca float, align 4, addrspace(5)
|
||||
%"40" = alloca double, align 8, addrspace(5)
|
||||
define amdgpu_kernel void @cvt_f64_f32(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
|
||||
%"33" = alloca i64, align 8, addrspace(5)
|
||||
%"34" = alloca i64, align 8, addrspace(5)
|
||||
%"35" = alloca float, align 4, addrspace(5)
|
||||
%"36" = alloca double, align 8, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"41" = load i64, ptr addrspace(4) %"35", align 4
|
||||
store i64 %"41", ptr addrspace(5) %"37", align 4
|
||||
%"42" = load i64, ptr addrspace(4) %"36", align 4
|
||||
store i64 %"42", ptr addrspace(5) %"38", align 4
|
||||
%"44" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"49" = inttoptr i64 %"44" to ptr addrspace(1)
|
||||
%"43" = load float, ptr addrspace(1) %"49", align 4
|
||||
store float %"43", ptr addrspace(5) %"39", align 4
|
||||
%"46" = load float, ptr addrspace(5) %"39", align 4
|
||||
%"45" = fpext float %"46" to double
|
||||
store double %"45", ptr addrspace(5) %"40", align 8
|
||||
%"47" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"48" = load double, ptr addrspace(5) %"40", align 8
|
||||
%"50" = inttoptr i64 %"47" to ptr
|
||||
store double %"48", ptr %"50", align 8
|
||||
br label %"30"
|
||||
|
||||
"30": ; preds = %1
|
||||
%"37" = load i64, ptr addrspace(4) %"31", align 4
|
||||
store i64 %"37", ptr addrspace(5) %"33", align 4
|
||||
%"38" = load i64, ptr addrspace(4) %"32", align 4
|
||||
store i64 %"38", ptr addrspace(5) %"34", align 4
|
||||
%"40" = load i64, ptr addrspace(5) %"33", align 4
|
||||
%"45" = inttoptr i64 %"40" to ptr addrspace(1)
|
||||
%"39" = load float, ptr addrspace(1) %"45", align 4
|
||||
store float %"39", ptr addrspace(5) %"35", align 4
|
||||
%"42" = load float, ptr addrspace(5) %"35", align 4
|
||||
%"41" = fpext float %"42" to double
|
||||
store double %"41", ptr addrspace(5) %"36", align 8
|
||||
%"43" = load i64, ptr addrspace(5) %"34", align 4
|
||||
%"44" = load double, ptr addrspace(5) %"36", align 8
|
||||
%"46" = inttoptr i64 %"43" to ptr
|
||||
store double %"44", ptr %"46", align 8
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -1,58 +1,49 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @cvt_rni(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #0 {
|
||||
%"41" = alloca i64, align 8, addrspace(5)
|
||||
%"42" = alloca i64, align 8, addrspace(5)
|
||||
%"43" = alloca float, align 4, addrspace(5)
|
||||
%"44" = alloca float, align 4, addrspace(5)
|
||||
define amdgpu_kernel void @cvt_rni(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 {
|
||||
%"37" = alloca i64, align 8, addrspace(5)
|
||||
%"38" = alloca i64, align 8, addrspace(5)
|
||||
%"39" = alloca float, align 4, addrspace(5)
|
||||
%"40" = alloca float, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"45" = load i64, ptr addrspace(4) %"39", align 4
|
||||
store i64 %"45", ptr addrspace(5) %"41", align 4
|
||||
%"46" = load i64, ptr addrspace(4) %"40", align 4
|
||||
store i64 %"46", ptr addrspace(5) %"42", align 4
|
||||
%"48" = load i64, ptr addrspace(5) %"41", align 4
|
||||
%"59" = inttoptr i64 %"48" to ptr
|
||||
%"47" = load float, ptr %"59", align 4
|
||||
store float %"47", ptr addrspace(5) %"43", align 4
|
||||
%"49" = load i64, ptr addrspace(5) %"41", align 4
|
||||
%"60" = inttoptr i64 %"49" to ptr
|
||||
%"30" = getelementptr inbounds i8, ptr %"60", i64 4
|
||||
%"50" = load float, ptr %"30", align 4
|
||||
store float %"50", ptr addrspace(5) %"44", align 4
|
||||
%"52" = load float, ptr addrspace(5) %"43", align 4
|
||||
%2 = call float @llvm.roundeven.f32(float %"52")
|
||||
%"51" = freeze float %2
|
||||
store float %"51", ptr addrspace(5) %"43", align 4
|
||||
%"54" = load float, ptr addrspace(5) %"44", align 4
|
||||
%3 = call float @llvm.roundeven.f32(float %"54")
|
||||
%"53" = freeze float %3
|
||||
store float %"53", ptr addrspace(5) %"44", align 4
|
||||
%"55" = load i64, ptr addrspace(5) %"42", align 4
|
||||
%"56" = load float, ptr addrspace(5) %"43", align 4
|
||||
%"61" = inttoptr i64 %"55" to ptr
|
||||
store float %"56", ptr %"61", align 4
|
||||
%"57" = load i64, ptr addrspace(5) %"42", align 4
|
||||
%"62" = inttoptr i64 %"57" to ptr
|
||||
%"32" = getelementptr inbounds i8, ptr %"62", i64 4
|
||||
%"58" = load float, ptr addrspace(5) %"44", align 4
|
||||
store float %"58", ptr %"32", align 4
|
||||
br label %"34"
|
||||
|
||||
"34": ; preds = %1
|
||||
%"41" = load i64, ptr addrspace(4) %"35", align 4
|
||||
store i64 %"41", ptr addrspace(5) %"37", align 4
|
||||
%"42" = load i64, ptr addrspace(4) %"36", align 4
|
||||
store i64 %"42", ptr addrspace(5) %"38", align 4
|
||||
%"44" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"55" = inttoptr i64 %"44" to ptr
|
||||
%"43" = load float, ptr %"55", align 4
|
||||
store float %"43", ptr addrspace(5) %"39", align 4
|
||||
%"45" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"56" = inttoptr i64 %"45" to ptr
|
||||
%"31" = getelementptr inbounds i8, ptr %"56", i64 4
|
||||
%"46" = load float, ptr %"31", align 4
|
||||
store float %"46", ptr addrspace(5) %"40", align 4
|
||||
%"48" = load float, ptr addrspace(5) %"39", align 4
|
||||
%2 = call float @llvm.roundeven.f32(float %"48")
|
||||
%"47" = freeze float %2
|
||||
store float %"47", ptr addrspace(5) %"39", align 4
|
||||
%"50" = load float, ptr addrspace(5) %"40", align 4
|
||||
%3 = call float @llvm.roundeven.f32(float %"50")
|
||||
%"49" = freeze float %3
|
||||
store float %"49", ptr addrspace(5) %"40", align 4
|
||||
%"51" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"52" = load float, ptr addrspace(5) %"39", align 4
|
||||
%"57" = inttoptr i64 %"51" to ptr
|
||||
store float %"52", ptr %"57", align 4
|
||||
%"53" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"58" = inttoptr i64 %"53" to ptr
|
||||
%"33" = getelementptr inbounds i8, ptr %"58", i64 4
|
||||
%"54" = load float, ptr addrspace(5) %"40", align 4
|
||||
store float %"54", ptr %"33", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
||||
declare float @llvm.roundeven.f32(float) #1
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
@ -1,58 +1,54 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @cvt_rzi(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #0 {
|
||||
%"41" = alloca i64, align 8, addrspace(5)
|
||||
%"42" = alloca i64, align 8, addrspace(5)
|
||||
%"43" = alloca float, align 4, addrspace(5)
|
||||
%"44" = alloca float, align 4, addrspace(5)
|
||||
define amdgpu_kernel void @cvt_rzi(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 {
|
||||
%"37" = alloca i64, align 8, addrspace(5)
|
||||
%"38" = alloca i64, align 8, addrspace(5)
|
||||
%"39" = alloca float, align 4, addrspace(5)
|
||||
%"40" = alloca float, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"45" = load i64, ptr addrspace(4) %"39", align 4
|
||||
store i64 %"45", ptr addrspace(5) %"41", align 4
|
||||
%"46" = load i64, ptr addrspace(4) %"40", align 4
|
||||
store i64 %"46", ptr addrspace(5) %"42", align 4
|
||||
%"48" = load i64, ptr addrspace(5) %"41", align 4
|
||||
%"59" = inttoptr i64 %"48" to ptr
|
||||
%"47" = load float, ptr %"59", align 4
|
||||
store float %"47", ptr addrspace(5) %"43", align 4
|
||||
%"49" = load i64, ptr addrspace(5) %"41", align 4
|
||||
%"60" = inttoptr i64 %"49" to ptr
|
||||
%"30" = getelementptr inbounds i8, ptr %"60", i64 4
|
||||
%"50" = load float, ptr %"30", align 4
|
||||
store float %"50", ptr addrspace(5) %"44", align 4
|
||||
%"52" = load float, ptr addrspace(5) %"43", align 4
|
||||
%2 = call float @llvm.trunc.f32(float %"52")
|
||||
%"51" = freeze float %2
|
||||
store float %"51", ptr addrspace(5) %"43", align 4
|
||||
%"54" = load float, ptr addrspace(5) %"44", align 4
|
||||
%3 = call float @llvm.trunc.f32(float %"54")
|
||||
%"53" = freeze float %3
|
||||
store float %"53", ptr addrspace(5) %"44", align 4
|
||||
%"55" = load i64, ptr addrspace(5) %"42", align 4
|
||||
%"56" = load float, ptr addrspace(5) %"43", align 4
|
||||
%"61" = inttoptr i64 %"55" to ptr
|
||||
store float %"56", ptr %"61", align 4
|
||||
%"57" = load i64, ptr addrspace(5) %"42", align 4
|
||||
%"62" = inttoptr i64 %"57" to ptr
|
||||
%"32" = getelementptr inbounds i8, ptr %"62", i64 4
|
||||
%"58" = load float, ptr addrspace(5) %"44", align 4
|
||||
store float %"58", ptr %"32", align 4
|
||||
br label %"34"
|
||||
|
||||
"34": ; preds = %1
|
||||
call void @llvm.amdgcn.s.setreg(i32 6145, i32 3)
|
||||
%"41" = load i64, ptr addrspace(4) %"35", align 4
|
||||
store i64 %"41", ptr addrspace(5) %"37", align 4
|
||||
%"42" = load i64, ptr addrspace(4) %"36", align 4
|
||||
store i64 %"42", ptr addrspace(5) %"38", align 4
|
||||
%"44" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"55" = inttoptr i64 %"44" to ptr
|
||||
%"43" = load float, ptr %"55", align 4
|
||||
store float %"43", ptr addrspace(5) %"39", align 4
|
||||
%"45" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"56" = inttoptr i64 %"45" to ptr
|
||||
%"31" = getelementptr inbounds i8, ptr %"56", i64 4
|
||||
%"46" = load float, ptr %"31", align 4
|
||||
store float %"46", ptr addrspace(5) %"40", align 4
|
||||
%"48" = load float, ptr addrspace(5) %"39", align 4
|
||||
%2 = call float @llvm.trunc.f32(float %"48")
|
||||
%"47" = freeze float %2
|
||||
store float %"47", ptr addrspace(5) %"39", align 4
|
||||
%"50" = load float, ptr addrspace(5) %"40", align 4
|
||||
%3 = call float @llvm.trunc.f32(float %"50")
|
||||
%"49" = freeze float %3
|
||||
store float %"49", ptr addrspace(5) %"40", align 4
|
||||
%"51" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"52" = load float, ptr addrspace(5) %"39", align 4
|
||||
%"57" = inttoptr i64 %"51" to ptr
|
||||
store float %"52", ptr %"57", align 4
|
||||
%"53" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"58" = inttoptr i64 %"53" to ptr
|
||||
%"33" = getelementptr inbounds i8, ptr %"58", i64 4
|
||||
%"54" = load float, ptr addrspace(5) %"40", align 4
|
||||
store float %"54", ptr %"33", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
||||
declare float @llvm.trunc.f32(float) #1
|
||||
; Function Attrs: nocallback nofree nosync nounwind willreturn
|
||||
declare void @llvm.amdgcn.s.setreg(i32 immarg, i32) #1
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
||||
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
||||
declare float @llvm.trunc.f32(float) #2
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { nocallback nofree nosync nounwind willreturn }
|
||||
attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
@ -1,41 +1,32 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @cvt_s16_s8(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 {
|
||||
%"37" = alloca i64, align 8, addrspace(5)
|
||||
%"38" = alloca i64, align 8, addrspace(5)
|
||||
%"39" = alloca i32, align 4, addrspace(5)
|
||||
%"40" = alloca i32, align 4, addrspace(5)
|
||||
define amdgpu_kernel void @cvt_s16_s8(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
|
||||
%"33" = alloca i64, align 8, addrspace(5)
|
||||
%"34" = alloca i64, align 8, addrspace(5)
|
||||
%"35" = alloca i32, align 4, addrspace(5)
|
||||
%"36" = alloca i32, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"41" = load i64, ptr addrspace(4) %"35", align 4
|
||||
store i64 %"41", ptr addrspace(5) %"37", align 4
|
||||
%"42" = load i64, ptr addrspace(4) %"36", align 4
|
||||
store i64 %"42", ptr addrspace(5) %"38", align 4
|
||||
%"44" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"49" = inttoptr i64 %"44" to ptr addrspace(1)
|
||||
%"43" = load i32, ptr addrspace(1) %"49", align 4
|
||||
store i32 %"43", ptr addrspace(5) %"40", align 4
|
||||
%"46" = load i32, ptr addrspace(5) %"40", align 4
|
||||
%2 = trunc i32 %"46" to i8
|
||||
%"50" = sext i8 %2 to i16
|
||||
%"45" = sext i16 %"50" to i32
|
||||
store i32 %"45", ptr addrspace(5) %"39", align 4
|
||||
%"47" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"48" = load i32, ptr addrspace(5) %"39", align 4
|
||||
%"52" = inttoptr i64 %"47" to ptr
|
||||
store i32 %"48", ptr %"52", align 4
|
||||
br label %"30"
|
||||
|
||||
"30": ; preds = %1
|
||||
%"37" = load i64, ptr addrspace(4) %"31", align 4
|
||||
store i64 %"37", ptr addrspace(5) %"33", align 4
|
||||
%"38" = load i64, ptr addrspace(4) %"32", align 4
|
||||
store i64 %"38", ptr addrspace(5) %"34", align 4
|
||||
%"40" = load i64, ptr addrspace(5) %"33", align 4
|
||||
%"45" = inttoptr i64 %"40" to ptr addrspace(1)
|
||||
%"39" = load i32, ptr addrspace(1) %"45", align 4
|
||||
store i32 %"39", ptr addrspace(5) %"36", align 4
|
||||
%"42" = load i32, ptr addrspace(5) %"36", align 4
|
||||
%2 = trunc i32 %"42" to i8
|
||||
%"46" = sext i8 %2 to i16
|
||||
%"41" = sext i16 %"46" to i32
|
||||
store i32 %"41", ptr addrspace(5) %"35", align 4
|
||||
%"43" = load i64, ptr addrspace(5) %"34", align 4
|
||||
%"44" = load i32, ptr addrspace(5) %"35", align 4
|
||||
%"48" = inttoptr i64 %"43" to ptr
|
||||
store i32 %"44", ptr %"48", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -1,64 +1,55 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @cvt_s32_f32(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #0 {
|
||||
%"41" = alloca i64, align 8, addrspace(5)
|
||||
%"42" = alloca i64, align 8, addrspace(5)
|
||||
%"43" = alloca i32, align 4, addrspace(5)
|
||||
%"44" = alloca i32, align 4, addrspace(5)
|
||||
define amdgpu_kernel void @cvt_s32_f32(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 {
|
||||
%"37" = alloca i64, align 8, addrspace(5)
|
||||
%"38" = alloca i64, align 8, addrspace(5)
|
||||
%"39" = alloca i32, align 4, addrspace(5)
|
||||
%"40" = alloca i32, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"45" = load i64, ptr addrspace(4) %"39", align 4
|
||||
store i64 %"45", ptr addrspace(5) %"41", align 4
|
||||
%"46" = load i64, ptr addrspace(4) %"40", align 4
|
||||
store i64 %"46", ptr addrspace(5) %"42", align 4
|
||||
%"48" = load i64, ptr addrspace(5) %"41", align 4
|
||||
%"60" = inttoptr i64 %"48" to ptr
|
||||
%"59" = load float, ptr %"60", align 4
|
||||
%"47" = bitcast float %"59" to i32
|
||||
store i32 %"47", ptr addrspace(5) %"43", align 4
|
||||
%"49" = load i64, ptr addrspace(5) %"41", align 4
|
||||
%"61" = inttoptr i64 %"49" to ptr
|
||||
%"30" = getelementptr inbounds i8, ptr %"61", i64 4
|
||||
%"62" = load float, ptr %"30", align 4
|
||||
%"50" = bitcast float %"62" to i32
|
||||
store i32 %"50", ptr addrspace(5) %"44", align 4
|
||||
%"52" = load i32, ptr addrspace(5) %"43", align 4
|
||||
%"64" = bitcast i32 %"52" to float
|
||||
%2 = call float @llvm.ceil.f32(float %"64")
|
||||
br label %"34"
|
||||
|
||||
"34": ; preds = %1
|
||||
%"41" = load i64, ptr addrspace(4) %"35", align 4
|
||||
store i64 %"41", ptr addrspace(5) %"37", align 4
|
||||
%"42" = load i64, ptr addrspace(4) %"36", align 4
|
||||
store i64 %"42", ptr addrspace(5) %"38", align 4
|
||||
%"44" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"56" = inttoptr i64 %"44" to ptr
|
||||
%"55" = load float, ptr %"56", align 4
|
||||
%"43" = bitcast float %"55" to i32
|
||||
store i32 %"43", ptr addrspace(5) %"39", align 4
|
||||
%"45" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"57" = inttoptr i64 %"45" to ptr
|
||||
%"31" = getelementptr inbounds i8, ptr %"57", i64 4
|
||||
%"58" = load float, ptr %"31", align 4
|
||||
%"46" = bitcast float %"58" to i32
|
||||
store i32 %"46", ptr addrspace(5) %"40", align 4
|
||||
%"48" = load i32, ptr addrspace(5) %"39", align 4
|
||||
%"60" = bitcast i32 %"48" to float
|
||||
%2 = call float @llvm.ceil.f32(float %"60")
|
||||
%3 = fptosi float %2 to i32
|
||||
%"63" = freeze i32 %3
|
||||
store i32 %"63", ptr addrspace(5) %"43", align 4
|
||||
%"54" = load i32, ptr addrspace(5) %"44", align 4
|
||||
%"66" = bitcast i32 %"54" to float
|
||||
%4 = call float @llvm.ceil.f32(float %"66")
|
||||
%"59" = freeze i32 %3
|
||||
store i32 %"59", ptr addrspace(5) %"39", align 4
|
||||
%"50" = load i32, ptr addrspace(5) %"40", align 4
|
||||
%"62" = bitcast i32 %"50" to float
|
||||
%4 = call float @llvm.ceil.f32(float %"62")
|
||||
%5 = fptosi float %4 to i32
|
||||
%"65" = freeze i32 %5
|
||||
store i32 %"65", ptr addrspace(5) %"44", align 4
|
||||
%"55" = load i64, ptr addrspace(5) %"42", align 4
|
||||
%"56" = load i32, ptr addrspace(5) %"43", align 4
|
||||
%"67" = inttoptr i64 %"55" to ptr addrspace(1)
|
||||
store i32 %"56", ptr addrspace(1) %"67", align 4
|
||||
%"57" = load i64, ptr addrspace(5) %"42", align 4
|
||||
%"69" = inttoptr i64 %"57" to ptr addrspace(1)
|
||||
%"32" = getelementptr inbounds i8, ptr addrspace(1) %"69", i64 4
|
||||
%"58" = load i32, ptr addrspace(5) %"44", align 4
|
||||
store i32 %"58", ptr addrspace(1) %"32", align 4
|
||||
%"61" = freeze i32 %5
|
||||
store i32 %"61", ptr addrspace(5) %"40", align 4
|
||||
%"51" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"52" = load i32, ptr addrspace(5) %"39", align 4
|
||||
%"63" = inttoptr i64 %"51" to ptr addrspace(1)
|
||||
store i32 %"52", ptr addrspace(1) %"63", align 4
|
||||
%"53" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"65" = inttoptr i64 %"53" to ptr addrspace(1)
|
||||
%"33" = getelementptr inbounds i8, ptr addrspace(1) %"65", i64 4
|
||||
%"54" = load i32, ptr addrspace(5) %"40", align 4
|
||||
store i32 %"54", ptr addrspace(1) %"33", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
||||
declare float @llvm.ceil.f32(float) #1
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
@ -1,39 +1,30 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @cvt_s64_s32(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 {
|
||||
%"37" = alloca i64, align 8, addrspace(5)
|
||||
%"38" = alloca i64, align 8, addrspace(5)
|
||||
%"39" = alloca i32, align 4, addrspace(5)
|
||||
%"40" = alloca i64, align 8, addrspace(5)
|
||||
define amdgpu_kernel void @cvt_s64_s32(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
|
||||
%"33" = alloca i64, align 8, addrspace(5)
|
||||
%"34" = alloca i64, align 8, addrspace(5)
|
||||
%"35" = alloca i32, align 4, addrspace(5)
|
||||
%"36" = alloca i64, align 8, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"41" = load i64, ptr addrspace(4) %"35", align 4
|
||||
store i64 %"41", ptr addrspace(5) %"37", align 4
|
||||
%"42" = load i64, ptr addrspace(4) %"36", align 4
|
||||
store i64 %"42", ptr addrspace(5) %"38", align 4
|
||||
%"44" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"50" = inttoptr i64 %"44" to ptr
|
||||
%"49" = load i32, ptr %"50", align 4
|
||||
store i32 %"49", ptr addrspace(5) %"39", align 4
|
||||
%"46" = load i32, ptr addrspace(5) %"39", align 4
|
||||
%"45" = sext i32 %"46" to i64
|
||||
store i64 %"45", ptr addrspace(5) %"40", align 4
|
||||
%"47" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"48" = load i64, ptr addrspace(5) %"40", align 4
|
||||
%"51" = inttoptr i64 %"47" to ptr
|
||||
store i64 %"48", ptr %"51", align 4
|
||||
br label %"30"
|
||||
|
||||
"30": ; preds = %1
|
||||
%"37" = load i64, ptr addrspace(4) %"31", align 4
|
||||
store i64 %"37", ptr addrspace(5) %"33", align 4
|
||||
%"38" = load i64, ptr addrspace(4) %"32", align 4
|
||||
store i64 %"38", ptr addrspace(5) %"34", align 4
|
||||
%"40" = load i64, ptr addrspace(5) %"33", align 4
|
||||
%"46" = inttoptr i64 %"40" to ptr
|
||||
%"45" = load i32, ptr %"46", align 4
|
||||
store i32 %"45", ptr addrspace(5) %"35", align 4
|
||||
%"42" = load i32, ptr addrspace(5) %"35", align 4
|
||||
%"41" = sext i32 %"42" to i64
|
||||
store i64 %"41", ptr addrspace(5) %"36", align 4
|
||||
%"43" = load i64, ptr addrspace(5) %"34", align 4
|
||||
%"44" = load i64, ptr addrspace(5) %"36", align 4
|
||||
%"47" = inttoptr i64 %"43" to ptr
|
||||
store i64 %"44", ptr %"47", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -1,42 +1,33 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @cvt_sat_s_u(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 {
|
||||
%"38" = alloca i64, align 8, addrspace(5)
|
||||
%"39" = alloca i64, align 8, addrspace(5)
|
||||
%"40" = alloca i32, align 4, addrspace(5)
|
||||
%"41" = alloca i32, align 4, addrspace(5)
|
||||
%"42" = alloca i32, align 4, addrspace(5)
|
||||
define amdgpu_kernel void @cvt_sat_s_u(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) byref(i64) %"33") #0 {
|
||||
%"34" = alloca i64, align 8, addrspace(5)
|
||||
%"35" = alloca i64, align 8, addrspace(5)
|
||||
%"36" = alloca i32, align 4, addrspace(5)
|
||||
%"37" = alloca i32, align 4, addrspace(5)
|
||||
%"38" = alloca i32, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"43" = load i64, ptr addrspace(4) %"36", align 4
|
||||
store i64 %"43", ptr addrspace(5) %"38", align 4
|
||||
%"44" = load i64, ptr addrspace(4) %"37", align 4
|
||||
store i64 %"44", ptr addrspace(5) %"39", align 4
|
||||
%"46" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"53" = inttoptr i64 %"46" to ptr
|
||||
%"45" = load i32, ptr %"53", align 4
|
||||
store i32 %"45", ptr addrspace(5) %"40", align 4
|
||||
%"48" = load i32, ptr addrspace(5) %"40", align 4
|
||||
%2 = call i32 @llvm.smax.i32(i32 %"48", i32 0)
|
||||
br label %"31"
|
||||
|
||||
"31": ; preds = %1
|
||||
%"39" = load i64, ptr addrspace(4) %"32", align 4
|
||||
store i64 %"39", ptr addrspace(5) %"34", align 4
|
||||
%"40" = load i64, ptr addrspace(4) %"33", align 4
|
||||
store i64 %"40", ptr addrspace(5) %"35", align 4
|
||||
%"42" = load i64, ptr addrspace(5) %"34", align 4
|
||||
%"49" = inttoptr i64 %"42" to ptr
|
||||
%"41" = load i32, ptr %"49", align 4
|
||||
store i32 %"41", ptr addrspace(5) %"36", align 4
|
||||
%"44" = load i32, ptr addrspace(5) %"36", align 4
|
||||
%2 = call i32 @llvm.smax.i32(i32 %"44", i32 0)
|
||||
%3 = call i32 @llvm.umin.i32(i32 %2, i32 -1)
|
||||
store i32 %3, ptr addrspace(5) %"41", align 4
|
||||
%"50" = load i32, ptr addrspace(5) %"41", align 4
|
||||
store i32 %"50", ptr addrspace(5) %"42", align 4
|
||||
%"51" = load i64, ptr addrspace(5) %"39", align 4
|
||||
%"52" = load i32, ptr addrspace(5) %"42", align 4
|
||||
%"54" = inttoptr i64 %"51" to ptr
|
||||
store i32 %"52", ptr %"54", align 4
|
||||
store i32 %3, ptr addrspace(5) %"37", align 4
|
||||
%"46" = load i32, ptr addrspace(5) %"37", align 4
|
||||
store i32 %"46", ptr addrspace(5) %"38", align 4
|
||||
%"47" = load i64, ptr addrspace(5) %"35", align 4
|
||||
%"48" = load i32, ptr addrspace(5) %"38", align 4
|
||||
%"50" = inttoptr i64 %"47" to ptr
|
||||
store i32 %"48", ptr %"50", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
@ -46,5 +37,5 @@ declare i32 @llvm.smax.i32(i32, i32) #1
|
||||
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
||||
declare i32 @llvm.umin.i32(i32, i32) #1
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
@ -1,43 +1,34 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @cvta(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 {
|
||||
%"36" = alloca i64, align 8, addrspace(5)
|
||||
%"37" = alloca i64, align 8, addrspace(5)
|
||||
%"38" = alloca float, align 4, addrspace(5)
|
||||
define amdgpu_kernel void @cvta(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
|
||||
%"32" = alloca i64, align 8, addrspace(5)
|
||||
%"33" = alloca i64, align 8, addrspace(5)
|
||||
%"34" = alloca float, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"39" = load i64, ptr addrspace(4) %"34", align 4
|
||||
store i64 %"39", ptr addrspace(5) %"36", align 4
|
||||
%"40" = load i64, ptr addrspace(4) %"35", align 4
|
||||
store i64 %"40", ptr addrspace(5) %"37", align 4
|
||||
%"42" = load i64, ptr addrspace(5) %"36", align 4
|
||||
%2 = inttoptr i64 %"42" to ptr
|
||||
%"49" = addrspacecast ptr %2 to ptr addrspace(1)
|
||||
store ptr addrspace(1) %"49", ptr addrspace(5) %"36", align 8
|
||||
%"44" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%3 = inttoptr i64 %"44" to ptr
|
||||
%"51" = addrspacecast ptr %3 to ptr addrspace(1)
|
||||
store ptr addrspace(1) %"51", ptr addrspace(5) %"37", align 8
|
||||
%"46" = load i64, ptr addrspace(5) %"36", align 4
|
||||
%"53" = inttoptr i64 %"46" to ptr addrspace(1)
|
||||
%"45" = load float, ptr addrspace(1) %"53", align 4
|
||||
store float %"45", ptr addrspace(5) %"38", align 4
|
||||
%"47" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"48" = load float, ptr addrspace(5) %"38", align 4
|
||||
%"54" = inttoptr i64 %"47" to ptr addrspace(1)
|
||||
store float %"48", ptr addrspace(1) %"54", align 4
|
||||
br label %"29"
|
||||
|
||||
"29": ; preds = %1
|
||||
%"35" = load i64, ptr addrspace(4) %"30", align 4
|
||||
store i64 %"35", ptr addrspace(5) %"32", align 4
|
||||
%"36" = load i64, ptr addrspace(4) %"31", align 4
|
||||
store i64 %"36", ptr addrspace(5) %"33", align 4
|
||||
%"38" = load i64, ptr addrspace(5) %"32", align 4
|
||||
%2 = inttoptr i64 %"38" to ptr
|
||||
%"45" = addrspacecast ptr %2 to ptr addrspace(1)
|
||||
store ptr addrspace(1) %"45", ptr addrspace(5) %"32", align 8
|
||||
%"40" = load i64, ptr addrspace(5) %"33", align 4
|
||||
%3 = inttoptr i64 %"40" to ptr
|
||||
%"47" = addrspacecast ptr %3 to ptr addrspace(1)
|
||||
store ptr addrspace(1) %"47", ptr addrspace(5) %"33", align 8
|
||||
%"42" = load i64, ptr addrspace(5) %"32", align 4
|
||||
%"49" = inttoptr i64 %"42" to ptr addrspace(1)
|
||||
%"41" = load float, ptr addrspace(1) %"49", align 4
|
||||
store float %"41", ptr addrspace(5) %"34", align 4
|
||||
%"43" = load i64, ptr addrspace(5) %"33", align 4
|
||||
%"44" = load float, ptr addrspace(5) %"34", align 4
|
||||
%"50" = inttoptr i64 %"43" to ptr addrspace(1)
|
||||
store float %"44", ptr addrspace(1) %"50", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -1,45 +1,36 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @div_approx(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 {
|
||||
%"39" = alloca i64, align 8, addrspace(5)
|
||||
%"40" = alloca i64, align 8, addrspace(5)
|
||||
%"41" = alloca float, align 4, addrspace(5)
|
||||
%"42" = alloca float, align 4, addrspace(5)
|
||||
define amdgpu_kernel void @div_approx(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 {
|
||||
%"35" = alloca i64, align 8, addrspace(5)
|
||||
%"36" = alloca i64, align 8, addrspace(5)
|
||||
%"37" = alloca float, align 4, addrspace(5)
|
||||
%"38" = alloca float, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"43" = load i64, ptr addrspace(4) %"37", align 4
|
||||
store i64 %"43", ptr addrspace(5) %"39", align 4
|
||||
%"44" = load i64, ptr addrspace(4) %"38", align 4
|
||||
store i64 %"44", ptr addrspace(5) %"40", align 4
|
||||
%"46" = load i64, ptr addrspace(5) %"39", align 4
|
||||
%"54" = inttoptr i64 %"46" to ptr
|
||||
%"45" = load float, ptr %"54", align 4
|
||||
store float %"45", ptr addrspace(5) %"41", align 4
|
||||
%"47" = load i64, ptr addrspace(5) %"39", align 4
|
||||
%"55" = inttoptr i64 %"47" to ptr
|
||||
%"30" = getelementptr inbounds i8, ptr %"55", i64 4
|
||||
%"48" = load float, ptr %"30", align 4
|
||||
store float %"48", ptr addrspace(5) %"42", align 4
|
||||
%"50" = load float, ptr addrspace(5) %"41", align 4
|
||||
%"51" = load float, ptr addrspace(5) %"42", align 4
|
||||
%"49" = fdiv arcp afn float %"50", %"51"
|
||||
store float %"49", ptr addrspace(5) %"41", align 4
|
||||
%"52" = load i64, ptr addrspace(5) %"40", align 4
|
||||
%"53" = load float, ptr addrspace(5) %"41", align 4
|
||||
%"56" = inttoptr i64 %"52" to ptr
|
||||
store float %"53", ptr %"56", align 4
|
||||
br label %"32"
|
||||
|
||||
"32": ; preds = %1
|
||||
%"39" = load i64, ptr addrspace(4) %"33", align 4
|
||||
store i64 %"39", ptr addrspace(5) %"35", align 4
|
||||
%"40" = load i64, ptr addrspace(4) %"34", align 4
|
||||
store i64 %"40", ptr addrspace(5) %"36", align 4
|
||||
%"42" = load i64, ptr addrspace(5) %"35", align 4
|
||||
%"50" = inttoptr i64 %"42" to ptr
|
||||
%"41" = load float, ptr %"50", align 4
|
||||
store float %"41", ptr addrspace(5) %"37", align 4
|
||||
%"43" = load i64, ptr addrspace(5) %"35", align 4
|
||||
%"51" = inttoptr i64 %"43" to ptr
|
||||
%"31" = getelementptr inbounds i8, ptr %"51", i64 4
|
||||
%"44" = load float, ptr %"31", align 4
|
||||
store float %"44", ptr addrspace(5) %"38", align 4
|
||||
%"46" = load float, ptr addrspace(5) %"37", align 4
|
||||
%"47" = load float, ptr addrspace(5) %"38", align 4
|
||||
%"45" = fdiv arcp afn float %"46", %"47"
|
||||
store float %"45", ptr addrspace(5) %"37", align 4
|
||||
%"48" = load i64, ptr addrspace(5) %"36", align 4
|
||||
%"49" = load float, ptr addrspace(5) %"37", align 4
|
||||
%"52" = inttoptr i64 %"48" to ptr
|
||||
store float %"49", ptr %"52", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -1,42 +1,33 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @ex2(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 {
|
||||
%"36" = alloca i64, align 8, addrspace(5)
|
||||
%"37" = alloca i64, align 8, addrspace(5)
|
||||
%"38" = alloca float, align 4, addrspace(5)
|
||||
define amdgpu_kernel void @ex2(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
|
||||
%"32" = alloca i64, align 8, addrspace(5)
|
||||
%"33" = alloca i64, align 8, addrspace(5)
|
||||
%"34" = alloca float, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"39" = load i64, ptr addrspace(4) %"34", align 4
|
||||
store i64 %"39", ptr addrspace(5) %"36", align 4
|
||||
%"40" = load i64, ptr addrspace(4) %"35", align 4
|
||||
store i64 %"40", ptr addrspace(5) %"37", align 4
|
||||
%"42" = load i64, ptr addrspace(5) %"36", align 4
|
||||
%"47" = inttoptr i64 %"42" to ptr
|
||||
%"41" = load float, ptr %"47", align 4
|
||||
store float %"41", ptr addrspace(5) %"38", align 4
|
||||
%"44" = load float, ptr addrspace(5) %"38", align 4
|
||||
%"43" = call float @llvm.amdgcn.exp2.f32(float %"44")
|
||||
store float %"43", ptr addrspace(5) %"38", align 4
|
||||
%"45" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"46" = load float, ptr addrspace(5) %"38", align 4
|
||||
%"48" = inttoptr i64 %"45" to ptr
|
||||
store float %"46", ptr %"48", align 4
|
||||
br label %"29"
|
||||
|
||||
"29": ; preds = %1
|
||||
%"35" = load i64, ptr addrspace(4) %"30", align 4
|
||||
store i64 %"35", ptr addrspace(5) %"32", align 4
|
||||
%"36" = load i64, ptr addrspace(4) %"31", align 4
|
||||
store i64 %"36", ptr addrspace(5) %"33", align 4
|
||||
%"38" = load i64, ptr addrspace(5) %"32", align 4
|
||||
%"43" = inttoptr i64 %"38" to ptr
|
||||
%"37" = load float, ptr %"43", align 4
|
||||
store float %"37", ptr addrspace(5) %"34", align 4
|
||||
%"40" = load float, ptr addrspace(5) %"34", align 4
|
||||
%"39" = call float @llvm.amdgcn.exp2.f32(float %"40")
|
||||
store float %"39", ptr addrspace(5) %"34", align 4
|
||||
%"41" = load i64, ptr addrspace(5) %"33", align 4
|
||||
%"42" = load float, ptr addrspace(5) %"34", align 4
|
||||
%"44" = inttoptr i64 %"41" to ptr
|
||||
store float %"42", ptr %"44", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
||||
declare float @llvm.amdgcn.exp2.f32(float) #1
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
@ -1,41 +1,32 @@
|
||||
@shared_mem = external addrspace(3) global [0 x i32]
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @extern_shared(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 {
|
||||
%"37" = alloca i64, align 8, addrspace(5)
|
||||
%"38" = alloca i64, align 8, addrspace(5)
|
||||
%"39" = alloca i64, align 8, addrspace(5)
|
||||
define amdgpu_kernel void @extern_shared(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
|
||||
%"33" = alloca i64, align 8, addrspace(5)
|
||||
%"34" = alloca i64, align 8, addrspace(5)
|
||||
%"35" = alloca i64, align 8, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"40" = load i64, ptr addrspace(4) %"35", align 4
|
||||
store i64 %"40", ptr addrspace(5) %"37", align 4
|
||||
%"41" = load i64, ptr addrspace(4) %"36", align 4
|
||||
store i64 %"41", ptr addrspace(5) %"38", align 4
|
||||
%"43" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"48" = inttoptr i64 %"43" to ptr addrspace(1)
|
||||
%"42" = load i64, ptr addrspace(1) %"48", align 4
|
||||
store i64 %"42", ptr addrspace(5) %"39", align 4
|
||||
%"44" = load i64, ptr addrspace(5) %"39", align 4
|
||||
store i64 %"44", ptr addrspace(3) @shared_mem, align 4
|
||||
%"45" = load i64, ptr addrspace(3) @shared_mem, align 4
|
||||
store i64 %"45", ptr addrspace(5) %"39", align 4
|
||||
%"46" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"47" = load i64, ptr addrspace(5) %"39", align 4
|
||||
%"51" = inttoptr i64 %"46" to ptr addrspace(1)
|
||||
store i64 %"47", ptr addrspace(1) %"51", align 4
|
||||
br label %"30"
|
||||
|
||||
"30": ; preds = %1
|
||||
%"36" = load i64, ptr addrspace(4) %"31", align 4
|
||||
store i64 %"36", ptr addrspace(5) %"33", align 4
|
||||
%"37" = load i64, ptr addrspace(4) %"32", align 4
|
||||
store i64 %"37", ptr addrspace(5) %"34", align 4
|
||||
%"39" = load i64, ptr addrspace(5) %"33", align 4
|
||||
%"44" = inttoptr i64 %"39" to ptr addrspace(1)
|
||||
%"38" = load i64, ptr addrspace(1) %"44", align 4
|
||||
store i64 %"38", ptr addrspace(5) %"35", align 4
|
||||
%"40" = load i64, ptr addrspace(5) %"35", align 4
|
||||
store i64 %"40", ptr addrspace(3) @shared_mem, align 4
|
||||
%"41" = load i64, ptr addrspace(3) @shared_mem, align 4
|
||||
store i64 %"41", ptr addrspace(5) %"35", align 4
|
||||
%"42" = load i64, ptr addrspace(5) %"34", align 4
|
||||
%"43" = load i64, ptr addrspace(5) %"35", align 4
|
||||
%"47" = inttoptr i64 %"42" to ptr addrspace(1)
|
||||
store i64 %"43", ptr addrspace(1) %"47", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -1,57 +1,55 @@
|
||||
@shared_mem = external addrspace(3) global [0 x i32], align 4
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define void @__zluda_ptx_impl_incr_shared_2_global() #0 {
|
||||
%"38" = alloca i64, align 8, addrspace(5)
|
||||
define void @incr_shared_2_global() #0 {
|
||||
%"36" = alloca i64, align 8, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"39" = load i64, ptr addrspace(3) @shared_mem, align 4
|
||||
store i64 %"39", ptr addrspace(5) %"38", align 4
|
||||
%"41" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"40" = add i64 %"41", 2
|
||||
store i64 %"40", ptr addrspace(5) %"38", align 4
|
||||
%"42" = load i64, ptr addrspace(5) %"38", align 4
|
||||
store i64 %"42", ptr addrspace(3) @shared_mem, align 4
|
||||
br label %"33"
|
||||
|
||||
"33": ; preds = %1
|
||||
%"37" = load i64, ptr addrspace(3) @shared_mem, align 4
|
||||
store i64 %"37", ptr addrspace(5) %"36", align 4
|
||||
%"39" = load i64, ptr addrspace(5) %"36", align 4
|
||||
%"38" = add i64 %"39", 2
|
||||
store i64 %"38", ptr addrspace(5) %"36", align 4
|
||||
%"40" = load i64, ptr addrspace(5) %"36", align 4
|
||||
store i64 %"40", ptr addrspace(3) @shared_mem, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @extern_shared_call(ptr addrspace(4) byref(i64) %"43", ptr addrspace(4) byref(i64) %"44") #0 {
|
||||
define amdgpu_kernel void @extern_shared_call(ptr addrspace(4) byref(i64) %"41", ptr addrspace(4) byref(i64) %"42") #1 {
|
||||
%"43" = alloca i64, align 8, addrspace(5)
|
||||
%"44" = alloca i64, align 8, addrspace(5)
|
||||
%"45" = alloca i64, align 8, addrspace(5)
|
||||
%"46" = alloca i64, align 8, addrspace(5)
|
||||
%"47" = alloca i64, align 8, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"48" = load i64, ptr addrspace(4) %"43", align 4
|
||||
br label %"34"
|
||||
|
||||
"34": ; preds = %1
|
||||
%"46" = load i64, ptr addrspace(4) %"41", align 4
|
||||
store i64 %"46", ptr addrspace(5) %"43", align 4
|
||||
%"47" = load i64, ptr addrspace(4) %"42", align 4
|
||||
store i64 %"47", ptr addrspace(5) %"44", align 4
|
||||
%"49" = load i64, ptr addrspace(5) %"43", align 4
|
||||
%"56" = inttoptr i64 %"49" to ptr addrspace(1)
|
||||
%"48" = load i64, ptr addrspace(1) %"56", align 4
|
||||
store i64 %"48", ptr addrspace(5) %"45", align 4
|
||||
%"49" = load i64, ptr addrspace(4) %"44", align 4
|
||||
store i64 %"49", ptr addrspace(5) %"46", align 4
|
||||
%"51" = load i64, ptr addrspace(5) %"45", align 4
|
||||
%"58" = inttoptr i64 %"51" to ptr addrspace(1)
|
||||
%"50" = load i64, ptr addrspace(1) %"58", align 4
|
||||
store i64 %"50", ptr addrspace(5) %"47", align 4
|
||||
%"52" = load i64, ptr addrspace(5) %"47", align 4
|
||||
store i64 %"52", ptr addrspace(3) @shared_mem, align 4
|
||||
call void @__zluda_ptx_impl_incr_shared_2_global()
|
||||
%"53" = load i64, ptr addrspace(3) @shared_mem, align 4
|
||||
store i64 %"53", ptr addrspace(5) %"47", align 4
|
||||
%"54" = load i64, ptr addrspace(5) %"46", align 4
|
||||
%"55" = load i64, ptr addrspace(5) %"47", align 4
|
||||
%"61" = inttoptr i64 %"54" to ptr addrspace(1)
|
||||
store i64 %"55", ptr addrspace(1) %"61", align 4
|
||||
%"50" = load i64, ptr addrspace(5) %"45", align 4
|
||||
store i64 %"50", ptr addrspace(3) @shared_mem, align 4
|
||||
call void @incr_shared_2_global()
|
||||
br label %"35"
|
||||
|
||||
"35": ; preds = %"34"
|
||||
%"51" = load i64, ptr addrspace(3) @shared_mem, align 4
|
||||
store i64 %"51", ptr addrspace(5) %"45", align 4
|
||||
%"52" = load i64, ptr addrspace(5) %"44", align 4
|
||||
%"53" = load i64, ptr addrspace(5) %"45", align 4
|
||||
%"59" = inttoptr i64 %"52" to ptr addrspace(1)
|
||||
store i64 %"53", ptr addrspace(1) %"59", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -1,56 +1,47 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @fma(ptr addrspace(4) byref(i64) %"40", ptr addrspace(4) byref(i64) %"41") #0 {
|
||||
%"42" = alloca i64, align 8, addrspace(5)
|
||||
%"43" = alloca i64, align 8, addrspace(5)
|
||||
%"44" = alloca float, align 4, addrspace(5)
|
||||
%"45" = alloca float, align 4, addrspace(5)
|
||||
%"46" = alloca float, align 4, addrspace(5)
|
||||
define amdgpu_kernel void @fma(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 {
|
||||
%"38" = alloca i64, align 8, addrspace(5)
|
||||
%"39" = alloca i64, align 8, addrspace(5)
|
||||
%"40" = alloca float, align 4, addrspace(5)
|
||||
%"41" = alloca float, align 4, addrspace(5)
|
||||
%"42" = alloca float, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"47" = load i64, ptr addrspace(4) %"40", align 4
|
||||
store i64 %"47", ptr addrspace(5) %"42", align 4
|
||||
%"48" = load i64, ptr addrspace(4) %"41", align 4
|
||||
store i64 %"48", ptr addrspace(5) %"43", align 4
|
||||
%"50" = load i64, ptr addrspace(5) %"42", align 4
|
||||
%"61" = inttoptr i64 %"50" to ptr
|
||||
%"49" = load float, ptr %"61", align 4
|
||||
store float %"49", ptr addrspace(5) %"44", align 4
|
||||
%"51" = load i64, ptr addrspace(5) %"42", align 4
|
||||
%"62" = inttoptr i64 %"51" to ptr
|
||||
%"31" = getelementptr inbounds i8, ptr %"62", i64 4
|
||||
%"52" = load float, ptr %"31", align 4
|
||||
store float %"52", ptr addrspace(5) %"45", align 4
|
||||
%"53" = load i64, ptr addrspace(5) %"42", align 4
|
||||
%"63" = inttoptr i64 %"53" to ptr
|
||||
%"33" = getelementptr inbounds i8, ptr %"63", i64 8
|
||||
%"54" = load float, ptr %"33", align 4
|
||||
store float %"54", ptr addrspace(5) %"46", align 4
|
||||
%"56" = load float, ptr addrspace(5) %"44", align 4
|
||||
%"57" = load float, ptr addrspace(5) %"45", align 4
|
||||
%"58" = load float, ptr addrspace(5) %"46", align 4
|
||||
%"55" = call float @llvm.fma.f32(float %"56", float %"57", float %"58")
|
||||
store float %"55", ptr addrspace(5) %"44", align 4
|
||||
%"59" = load i64, ptr addrspace(5) %"43", align 4
|
||||
%"60" = load float, ptr addrspace(5) %"44", align 4
|
||||
%"64" = inttoptr i64 %"59" to ptr
|
||||
store float %"60", ptr %"64", align 4
|
||||
br label %"35"
|
||||
|
||||
"35": ; preds = %1
|
||||
%"43" = load i64, ptr addrspace(4) %"36", align 4
|
||||
store i64 %"43", ptr addrspace(5) %"38", align 4
|
||||
%"44" = load i64, ptr addrspace(4) %"37", align 4
|
||||
store i64 %"44", ptr addrspace(5) %"39", align 4
|
||||
%"46" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"57" = inttoptr i64 %"46" to ptr
|
||||
%"45" = load float, ptr %"57", align 4
|
||||
store float %"45", ptr addrspace(5) %"40", align 4
|
||||
%"47" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"58" = inttoptr i64 %"47" to ptr
|
||||
%"32" = getelementptr inbounds i8, ptr %"58", i64 4
|
||||
%"48" = load float, ptr %"32", align 4
|
||||
store float %"48", ptr addrspace(5) %"41", align 4
|
||||
%"49" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"59" = inttoptr i64 %"49" to ptr
|
||||
%"34" = getelementptr inbounds i8, ptr %"59", i64 8
|
||||
%"50" = load float, ptr %"34", align 4
|
||||
store float %"50", ptr addrspace(5) %"42", align 4
|
||||
%"52" = load float, ptr addrspace(5) %"40", align 4
|
||||
%"53" = load float, ptr addrspace(5) %"41", align 4
|
||||
%"54" = load float, ptr addrspace(5) %"42", align 4
|
||||
%"51" = call float @llvm.fma.f32(float %"52", float %"53", float %"54")
|
||||
store float %"51", ptr addrspace(5) %"40", align 4
|
||||
%"55" = load i64, ptr addrspace(5) %"39", align 4
|
||||
%"56" = load float, ptr addrspace(5) %"40", align 4
|
||||
%"60" = inttoptr i64 %"55" to ptr
|
||||
store float %"56", ptr %"60", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
||||
declare float @llvm.fma.f32(float, float, float) #1
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
@ -1,36 +1,27 @@
|
||||
@foobar = addrspace(1) global [4 x i32] [i32 1, i32 0, i32 0, i32 0]
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @global_array(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 {
|
||||
%"37" = alloca i64, align 8, addrspace(5)
|
||||
%"38" = alloca i64, align 8, addrspace(5)
|
||||
%"39" = alloca i32, align 4, addrspace(5)
|
||||
define amdgpu_kernel void @global_array(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
|
||||
%"33" = alloca i64, align 8, addrspace(5)
|
||||
%"34" = alloca i64, align 8, addrspace(5)
|
||||
%"35" = alloca i32, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
store i64 ptrtoint (ptr addrspace(1) @foobar to i64), ptr addrspace(5) %"37", align 4
|
||||
%"41" = load i64, ptr addrspace(4) %"36", align 4
|
||||
store i64 %"41", ptr addrspace(5) %"38", align 4
|
||||
%"43" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"47" = inttoptr i64 %"43" to ptr addrspace(1)
|
||||
%"42" = load i32, ptr addrspace(1) %"47", align 4
|
||||
store i32 %"42", ptr addrspace(5) %"39", align 4
|
||||
%"44" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"45" = load i32, ptr addrspace(5) %"39", align 4
|
||||
%"48" = inttoptr i64 %"44" to ptr addrspace(1)
|
||||
store i32 %"45", ptr addrspace(1) %"48", align 4
|
||||
br label %"30"
|
||||
|
||||
"30": ; preds = %1
|
||||
store i64 ptrtoint (ptr addrspace(1) @foobar to i64), ptr addrspace(5) %"33", align 4
|
||||
%"37" = load i64, ptr addrspace(4) %"32", align 4
|
||||
store i64 %"37", ptr addrspace(5) %"34", align 4
|
||||
%"39" = load i64, ptr addrspace(5) %"33", align 4
|
||||
%"43" = inttoptr i64 %"39" to ptr addrspace(1)
|
||||
%"38" = load i32, ptr addrspace(1) %"43", align 4
|
||||
store i32 %"38", ptr addrspace(5) %"35", align 4
|
||||
%"40" = load i64, ptr addrspace(5) %"34", align 4
|
||||
%"41" = load i32, ptr addrspace(5) %"35", align 4
|
||||
%"44" = inttoptr i64 %"40" to ptr addrspace(1)
|
||||
store i32 %"41", ptr addrspace(1) %"44", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -1,35 +1,26 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @ld_st(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 {
|
||||
%"36" = alloca i64, align 8, addrspace(5)
|
||||
%"37" = alloca i64, align 8, addrspace(5)
|
||||
%"38" = alloca i64, align 8, addrspace(5)
|
||||
define amdgpu_kernel void @ld_st(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
|
||||
%"32" = alloca i64, align 8, addrspace(5)
|
||||
%"33" = alloca i64, align 8, addrspace(5)
|
||||
%"34" = alloca i64, align 8, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"39" = load i64, ptr addrspace(4) %"34", align 4
|
||||
store i64 %"39", ptr addrspace(5) %"36", align 4
|
||||
%"40" = load i64, ptr addrspace(4) %"35", align 4
|
||||
store i64 %"40", ptr addrspace(5) %"37", align 4
|
||||
%"42" = load i64, ptr addrspace(5) %"36", align 4
|
||||
%"45" = inttoptr i64 %"42" to ptr
|
||||
%"41" = load i64, ptr %"45", align 4
|
||||
store i64 %"41", ptr addrspace(5) %"38", align 4
|
||||
%"43" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"44" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"46" = inttoptr i64 %"43" to ptr
|
||||
store i64 %"44", ptr %"46", align 4
|
||||
br label %"29"
|
||||
|
||||
"29": ; preds = %1
|
||||
%"35" = load i64, ptr addrspace(4) %"30", align 4
|
||||
store i64 %"35", ptr addrspace(5) %"32", align 4
|
||||
%"36" = load i64, ptr addrspace(4) %"31", align 4
|
||||
store i64 %"36", ptr addrspace(5) %"33", align 4
|
||||
%"38" = load i64, ptr addrspace(5) %"32", align 4
|
||||
%"41" = inttoptr i64 %"38" to ptr
|
||||
%"37" = load i64, ptr %"41", align 4
|
||||
store i64 %"37", ptr addrspace(5) %"34", align 4
|
||||
%"39" = load i64, ptr addrspace(5) %"33", align 4
|
||||
%"40" = load i64, ptr addrspace(5) %"34", align 4
|
||||
%"42" = inttoptr i64 %"39" to ptr
|
||||
store i64 %"40", ptr %"42", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -1,40 +1,31 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @ld_st_implicit(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 {
|
||||
%"37" = alloca i64, align 8, addrspace(5)
|
||||
%"38" = alloca i64, align 8, addrspace(5)
|
||||
%"39" = alloca i64, align 8, addrspace(5)
|
||||
define amdgpu_kernel void @ld_st_implicit(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
|
||||
%"33" = alloca i64, align 8, addrspace(5)
|
||||
%"34" = alloca i64, align 8, addrspace(5)
|
||||
%"35" = alloca i64, align 8, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"40" = load i64, ptr addrspace(4) %"35", align 4
|
||||
store i64 %"40", ptr addrspace(5) %"37", align 4
|
||||
%"41" = load i64, ptr addrspace(4) %"36", align 4
|
||||
store i64 %"41", ptr addrspace(5) %"38", align 4
|
||||
store i64 81985529216486895, ptr addrspace(5) %"39", align 4
|
||||
%"44" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"48" = inttoptr i64 %"44" to ptr addrspace(1)
|
||||
%"47" = load float, ptr addrspace(1) %"48", align 4
|
||||
%2 = bitcast float %"47" to i32
|
||||
%"43" = zext i32 %2 to i64
|
||||
store i64 %"43", ptr addrspace(5) %"39", align 4
|
||||
%"45" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"46" = load i64, ptr addrspace(5) %"39", align 4
|
||||
%"49" = inttoptr i64 %"45" to ptr addrspace(1)
|
||||
%3 = trunc i64 %"46" to i32
|
||||
%"50" = bitcast i32 %3 to float
|
||||
store float %"50", ptr addrspace(1) %"49", align 4
|
||||
br label %"30"
|
||||
|
||||
"30": ; preds = %1
|
||||
%"36" = load i64, ptr addrspace(4) %"31", align 4
|
||||
store i64 %"36", ptr addrspace(5) %"33", align 4
|
||||
%"37" = load i64, ptr addrspace(4) %"32", align 4
|
||||
store i64 %"37", ptr addrspace(5) %"34", align 4
|
||||
store i64 81985529216486895, ptr addrspace(5) %"35", align 4
|
||||
%"40" = load i64, ptr addrspace(5) %"33", align 4
|
||||
%"44" = inttoptr i64 %"40" to ptr addrspace(1)
|
||||
%"43" = load float, ptr addrspace(1) %"44", align 4
|
||||
%2 = bitcast float %"43" to i32
|
||||
%"39" = zext i32 %2 to i64
|
||||
store i64 %"39", ptr addrspace(5) %"35", align 4
|
||||
%"41" = load i64, ptr addrspace(5) %"34", align 4
|
||||
%"42" = load i64, ptr addrspace(5) %"35", align 4
|
||||
%"45" = inttoptr i64 %"41" to ptr addrspace(1)
|
||||
%3 = trunc i64 %"42" to i32
|
||||
%"46" = bitcast i32 %3 to float
|
||||
store float %"46", ptr addrspace(1) %"45", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -1,46 +1,37 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @ld_st_offset(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #0 {
|
||||
%"41" = alloca i64, align 8, addrspace(5)
|
||||
%"42" = alloca i64, align 8, addrspace(5)
|
||||
%"43" = alloca i32, align 4, addrspace(5)
|
||||
%"44" = alloca i32, align 4, addrspace(5)
|
||||
define amdgpu_kernel void @ld_st_offset(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 {
|
||||
%"37" = alloca i64, align 8, addrspace(5)
|
||||
%"38" = alloca i64, align 8, addrspace(5)
|
||||
%"39" = alloca i32, align 4, addrspace(5)
|
||||
%"40" = alloca i32, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"45" = load i64, ptr addrspace(4) %"39", align 4
|
||||
store i64 %"45", ptr addrspace(5) %"41", align 4
|
||||
%"46" = load i64, ptr addrspace(4) %"40", align 4
|
||||
store i64 %"46", ptr addrspace(5) %"42", align 4
|
||||
%"48" = load i64, ptr addrspace(5) %"41", align 4
|
||||
%"55" = inttoptr i64 %"48" to ptr
|
||||
%"47" = load i32, ptr %"55", align 4
|
||||
store i32 %"47", ptr addrspace(5) %"43", align 4
|
||||
%"49" = load i64, ptr addrspace(5) %"41", align 4
|
||||
%"56" = inttoptr i64 %"49" to ptr
|
||||
%"30" = getelementptr inbounds i8, ptr %"56", i64 4
|
||||
%"50" = load i32, ptr %"30", align 4
|
||||
store i32 %"50", ptr addrspace(5) %"44", align 4
|
||||
%"51" = load i64, ptr addrspace(5) %"42", align 4
|
||||
%"52" = load i32, ptr addrspace(5) %"44", align 4
|
||||
%"57" = inttoptr i64 %"51" to ptr
|
||||
store i32 %"52", ptr %"57", align 4
|
||||
%"53" = load i64, ptr addrspace(5) %"42", align 4
|
||||
%"58" = inttoptr i64 %"53" to ptr
|
||||
%"32" = getelementptr inbounds i8, ptr %"58", i64 4
|
||||
%"54" = load i32, ptr addrspace(5) %"43", align 4
|
||||
store i32 %"54", ptr %"32", align 4
|
||||
br label %"34"
|
||||
|
||||
"34": ; preds = %1
|
||||
%"41" = load i64, ptr addrspace(4) %"35", align 4
|
||||
store i64 %"41", ptr addrspace(5) %"37", align 4
|
||||
%"42" = load i64, ptr addrspace(4) %"36", align 4
|
||||
store i64 %"42", ptr addrspace(5) %"38", align 4
|
||||
%"44" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"51" = inttoptr i64 %"44" to ptr
|
||||
%"43" = load i32, ptr %"51", align 4
|
||||
store i32 %"43", ptr addrspace(5) %"39", align 4
|
||||
%"45" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"52" = inttoptr i64 %"45" to ptr
|
||||
%"31" = getelementptr inbounds i8, ptr %"52", i64 4
|
||||
%"46" = load i32, ptr %"31", align 4
|
||||
store i32 %"46", ptr addrspace(5) %"40", align 4
|
||||
%"47" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"48" = load i32, ptr addrspace(5) %"40", align 4
|
||||
%"53" = inttoptr i64 %"47" to ptr
|
||||
store i32 %"48", ptr %"53", align 4
|
||||
%"49" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"54" = inttoptr i64 %"49" to ptr
|
||||
%"33" = getelementptr inbounds i8, ptr %"54", i64 4
|
||||
%"50" = load i32, ptr addrspace(5) %"39", align 4
|
||||
store i32 %"50", ptr %"33", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -1,42 +1,33 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @lg2(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 {
|
||||
%"36" = alloca i64, align 8, addrspace(5)
|
||||
%"37" = alloca i64, align 8, addrspace(5)
|
||||
%"38" = alloca float, align 4, addrspace(5)
|
||||
define amdgpu_kernel void @lg2(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
|
||||
%"32" = alloca i64, align 8, addrspace(5)
|
||||
%"33" = alloca i64, align 8, addrspace(5)
|
||||
%"34" = alloca float, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"39" = load i64, ptr addrspace(4) %"34", align 4
|
||||
store i64 %"39", ptr addrspace(5) %"36", align 4
|
||||
%"40" = load i64, ptr addrspace(4) %"35", align 4
|
||||
store i64 %"40", ptr addrspace(5) %"37", align 4
|
||||
%"42" = load i64, ptr addrspace(5) %"36", align 4
|
||||
%"47" = inttoptr i64 %"42" to ptr
|
||||
%"41" = load float, ptr %"47", align 4
|
||||
store float %"41", ptr addrspace(5) %"38", align 4
|
||||
%"44" = load float, ptr addrspace(5) %"38", align 4
|
||||
%"43" = call float @llvm.amdgcn.log.f32(float %"44")
|
||||
store float %"43", ptr addrspace(5) %"38", align 4
|
||||
%"45" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"46" = load float, ptr addrspace(5) %"38", align 4
|
||||
%"48" = inttoptr i64 %"45" to ptr
|
||||
store float %"46", ptr %"48", align 4
|
||||
br label %"29"
|
||||
|
||||
"29": ; preds = %1
|
||||
%"35" = load i64, ptr addrspace(4) %"30", align 4
|
||||
store i64 %"35", ptr addrspace(5) %"32", align 4
|
||||
%"36" = load i64, ptr addrspace(4) %"31", align 4
|
||||
store i64 %"36", ptr addrspace(5) %"33", align 4
|
||||
%"38" = load i64, ptr addrspace(5) %"32", align 4
|
||||
%"43" = inttoptr i64 %"38" to ptr
|
||||
%"37" = load float, ptr %"43", align 4
|
||||
store float %"37", ptr addrspace(5) %"34", align 4
|
||||
%"40" = load float, ptr addrspace(5) %"34", align 4
|
||||
%"39" = call float @llvm.amdgcn.log.f32(float %"40")
|
||||
store float %"39", ptr addrspace(5) %"34", align 4
|
||||
%"41" = load i64, ptr addrspace(5) %"33", align 4
|
||||
%"42" = load float, ptr addrspace(5) %"34", align 4
|
||||
%"44" = inttoptr i64 %"41" to ptr
|
||||
store float %"42", ptr %"44", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
||||
declare float @llvm.amdgcn.log.f32(float) #1
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
@ -1,36 +1,27 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @local_align(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 {
|
||||
%"9" = alloca [8 x i8], align 8, addrspace(5)
|
||||
%"37" = alloca i64, align 8, addrspace(5)
|
||||
%"38" = alloca i64, align 8, addrspace(5)
|
||||
%"39" = alloca i64, align 8, addrspace(5)
|
||||
define amdgpu_kernel void @local_align(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
|
||||
%"10" = alloca [8 x i8], align 8, addrspace(5)
|
||||
%"33" = alloca i64, align 8, addrspace(5)
|
||||
%"34" = alloca i64, align 8, addrspace(5)
|
||||
%"35" = alloca i64, align 8, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"40" = load i64, ptr addrspace(4) %"35", align 4
|
||||
store i64 %"40", ptr addrspace(5) %"37", align 4
|
||||
%"41" = load i64, ptr addrspace(4) %"36", align 4
|
||||
store i64 %"41", ptr addrspace(5) %"38", align 4
|
||||
%"43" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"46" = inttoptr i64 %"43" to ptr
|
||||
%"42" = load i64, ptr %"46", align 4
|
||||
store i64 %"42", ptr addrspace(5) %"39", align 4
|
||||
%"44" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"45" = load i64, ptr addrspace(5) %"39", align 4
|
||||
%"47" = inttoptr i64 %"44" to ptr
|
||||
store i64 %"45", ptr %"47", align 4
|
||||
br label %"30"
|
||||
|
||||
"30": ; preds = %1
|
||||
%"36" = load i64, ptr addrspace(4) %"31", align 4
|
||||
store i64 %"36", ptr addrspace(5) %"33", align 4
|
||||
%"37" = load i64, ptr addrspace(4) %"32", align 4
|
||||
store i64 %"37", ptr addrspace(5) %"34", align 4
|
||||
%"39" = load i64, ptr addrspace(5) %"33", align 4
|
||||
%"42" = inttoptr i64 %"39" to ptr
|
||||
%"38" = load i64, ptr %"42", align 4
|
||||
store i64 %"38", ptr addrspace(5) %"35", align 4
|
||||
%"40" = load i64, ptr addrspace(5) %"34", align 4
|
||||
%"41" = load i64, ptr addrspace(5) %"35", align 4
|
||||
%"43" = inttoptr i64 %"40" to ptr
|
||||
store i64 %"41", ptr %"43", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -1,64 +1,55 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @mad_s32(ptr addrspace(4) byref(i64) %"45", ptr addrspace(4) byref(i64) %"46") #0 {
|
||||
%"47" = alloca i64, align 8, addrspace(5)
|
||||
%"48" = alloca i64, align 8, addrspace(5)
|
||||
%"49" = alloca i32, align 4, addrspace(5)
|
||||
%"50" = alloca i32, align 4, addrspace(5)
|
||||
%"51" = alloca i32, align 4, addrspace(5)
|
||||
%"52" = alloca i32, align 4, addrspace(5)
|
||||
define amdgpu_kernel void @mad_s32(ptr addrspace(4) byref(i64) %"41", ptr addrspace(4) byref(i64) %"42") #0 {
|
||||
%"43" = alloca i64, align 8, addrspace(5)
|
||||
%"44" = alloca i64, align 8, addrspace(5)
|
||||
%"45" = alloca i32, align 4, addrspace(5)
|
||||
%"46" = alloca i32, align 4, addrspace(5)
|
||||
%"47" = alloca i32, align 4, addrspace(5)
|
||||
%"48" = alloca i32, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"53" = load i64, ptr addrspace(4) %"45", align 4
|
||||
store i64 %"53", ptr addrspace(5) %"47", align 4
|
||||
%"54" = load i64, ptr addrspace(4) %"46", align 4
|
||||
store i64 %"54", ptr addrspace(5) %"48", align 4
|
||||
%"56" = load i64, ptr addrspace(5) %"47", align 4
|
||||
%"71" = inttoptr i64 %"56" to ptr
|
||||
%"55" = load i32, ptr %"71", align 4
|
||||
store i32 %"55", ptr addrspace(5) %"50", align 4
|
||||
%"57" = load i64, ptr addrspace(5) %"47", align 4
|
||||
%"72" = inttoptr i64 %"57" to ptr
|
||||
%"32" = getelementptr inbounds i8, ptr %"72", i64 4
|
||||
%"58" = load i32, ptr %"32", align 4
|
||||
store i32 %"58", ptr addrspace(5) %"51", align 4
|
||||
%"59" = load i64, ptr addrspace(5) %"47", align 4
|
||||
%"73" = inttoptr i64 %"59" to ptr
|
||||
%"34" = getelementptr inbounds i8, ptr %"73", i64 8
|
||||
%"60" = load i32, ptr %"34", align 4
|
||||
store i32 %"60", ptr addrspace(5) %"52", align 4
|
||||
%"62" = load i32, ptr addrspace(5) %"50", align 4
|
||||
%"63" = load i32, ptr addrspace(5) %"51", align 4
|
||||
%"64" = load i32, ptr addrspace(5) %"52", align 4
|
||||
%2 = mul i32 %"62", %"63"
|
||||
%"61" = add i32 %2, %"64"
|
||||
store i32 %"61", ptr addrspace(5) %"49", align 4
|
||||
%"65" = load i64, ptr addrspace(5) %"48", align 4
|
||||
%"66" = load i32, ptr addrspace(5) %"49", align 4
|
||||
%"74" = inttoptr i64 %"65" to ptr
|
||||
store i32 %"66", ptr %"74", align 4
|
||||
%"67" = load i64, ptr addrspace(5) %"48", align 4
|
||||
%"75" = inttoptr i64 %"67" to ptr
|
||||
%"36" = getelementptr inbounds i8, ptr %"75", i64 4
|
||||
%"68" = load i32, ptr addrspace(5) %"49", align 4
|
||||
store i32 %"68", ptr %"36", align 4
|
||||
%"69" = load i64, ptr addrspace(5) %"48", align 4
|
||||
%"76" = inttoptr i64 %"69" to ptr
|
||||
%"38" = getelementptr inbounds i8, ptr %"76", i64 8
|
||||
%"70" = load i32, ptr addrspace(5) %"49", align 4
|
||||
store i32 %"70", ptr %"38", align 4
|
||||
br label %"40"
|
||||
|
||||
"40": ; preds = %1
|
||||
%"49" = load i64, ptr addrspace(4) %"41", align 4
|
||||
store i64 %"49", ptr addrspace(5) %"43", align 4
|
||||
%"50" = load i64, ptr addrspace(4) %"42", align 4
|
||||
store i64 %"50", ptr addrspace(5) %"44", align 4
|
||||
%"52" = load i64, ptr addrspace(5) %"43", align 4
|
||||
%"67" = inttoptr i64 %"52" to ptr
|
||||
%"51" = load i32, ptr %"67", align 4
|
||||
store i32 %"51", ptr addrspace(5) %"46", align 4
|
||||
%"53" = load i64, ptr addrspace(5) %"43", align 4
|
||||
%"68" = inttoptr i64 %"53" to ptr
|
||||
%"33" = getelementptr inbounds i8, ptr %"68", i64 4
|
||||
%"54" = load i32, ptr %"33", align 4
|
||||
store i32 %"54", ptr addrspace(5) %"47", align 4
|
||||
%"55" = load i64, ptr addrspace(5) %"43", align 4
|
||||
%"69" = inttoptr i64 %"55" to ptr
|
||||
%"35" = getelementptr inbounds i8, ptr %"69", i64 8
|
||||
%"56" = load i32, ptr %"35", align 4
|
||||
store i32 %"56", ptr addrspace(5) %"48", align 4
|
||||
%"58" = load i32, ptr addrspace(5) %"46", align 4
|
||||
%"59" = load i32, ptr addrspace(5) %"47", align 4
|
||||
%"60" = load i32, ptr addrspace(5) %"48", align 4
|
||||
%2 = mul i32 %"58", %"59"
|
||||
%"57" = add i32 %2, %"60"
|
||||
store i32 %"57", ptr addrspace(5) %"45", align 4
|
||||
%"61" = load i64, ptr addrspace(5) %"44", align 4
|
||||
%"62" = load i32, ptr addrspace(5) %"45", align 4
|
||||
%"70" = inttoptr i64 %"61" to ptr
|
||||
store i32 %"62", ptr %"70", align 4
|
||||
%"63" = load i64, ptr addrspace(5) %"44", align 4
|
||||
%"71" = inttoptr i64 %"63" to ptr
|
||||
%"37" = getelementptr inbounds i8, ptr %"71", i64 4
|
||||
%"64" = load i32, ptr addrspace(5) %"45", align 4
|
||||
store i32 %"64", ptr %"37", align 4
|
||||
%"65" = load i64, ptr addrspace(5) %"44", align 4
|
||||
%"72" = inttoptr i64 %"65" to ptr
|
||||
%"39" = getelementptr inbounds i8, ptr %"72", i64 8
|
||||
%"66" = load i32, ptr addrspace(5) %"45", align 4
|
||||
store i32 %"66", ptr %"39", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
33
ptx/src/test/ll/malformed_label.ll
Normal file
33
ptx/src/test/ll/malformed_label.ll
Normal file
@ -0,0 +1,33 @@
|
||||
define amdgpu_kernel void @malformed_label(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 {
|
||||
%"36" = alloca i64, align 8, addrspace(5)
|
||||
%"37" = alloca i64, align 8, addrspace(5)
|
||||
%"38" = alloca i64, align 8, addrspace(5)
|
||||
%"39" = alloca i64, align 8, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
br label %"32"
|
||||
|
||||
"32": ; preds = %1
|
||||
%"40" = load i64, ptr addrspace(4) %"34", align 4
|
||||
store i64 %"40", ptr addrspace(5) %"36", align 4
|
||||
%"41" = load i64, ptr addrspace(4) %"35", align 4
|
||||
store i64 %"41", ptr addrspace(5) %"37", align 4
|
||||
br label %"10"
|
||||
|
||||
"10": ; preds = %"32"
|
||||
%"43" = load i64, ptr addrspace(5) %"36", align 4
|
||||
%"48" = inttoptr i64 %"43" to ptr
|
||||
%"42" = load i64, ptr %"48", align 4
|
||||
store i64 %"42", ptr addrspace(5) %"38", align 4
|
||||
%"45" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"44" = add i64 %"45", 1
|
||||
store i64 %"44", ptr addrspace(5) %"39", align 4
|
||||
%"46" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"47" = load i64, ptr addrspace(5) %"39", align 4
|
||||
%"49" = inttoptr i64 %"46" to ptr
|
||||
store i64 %"47", ptr %"49", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -1,49 +1,40 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @max(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 {
|
||||
%"39" = alloca i64, align 8, addrspace(5)
|
||||
%"40" = alloca i64, align 8, addrspace(5)
|
||||
%"41" = alloca i32, align 4, addrspace(5)
|
||||
%"42" = alloca i32, align 4, addrspace(5)
|
||||
define amdgpu_kernel void @max(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 {
|
||||
%"35" = alloca i64, align 8, addrspace(5)
|
||||
%"36" = alloca i64, align 8, addrspace(5)
|
||||
%"37" = alloca i32, align 4, addrspace(5)
|
||||
%"38" = alloca i32, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"43" = load i64, ptr addrspace(4) %"37", align 4
|
||||
store i64 %"43", ptr addrspace(5) %"39", align 4
|
||||
%"44" = load i64, ptr addrspace(4) %"38", align 4
|
||||
store i64 %"44", ptr addrspace(5) %"40", align 4
|
||||
%"46" = load i64, ptr addrspace(5) %"39", align 4
|
||||
%"54" = inttoptr i64 %"46" to ptr
|
||||
%"45" = load i32, ptr %"54", align 4
|
||||
store i32 %"45", ptr addrspace(5) %"41", align 4
|
||||
%"47" = load i64, ptr addrspace(5) %"39", align 4
|
||||
%"55" = inttoptr i64 %"47" to ptr
|
||||
%"30" = getelementptr inbounds i8, ptr %"55", i64 4
|
||||
%"48" = load i32, ptr %"30", align 4
|
||||
store i32 %"48", ptr addrspace(5) %"42", align 4
|
||||
%"50" = load i32, ptr addrspace(5) %"41", align 4
|
||||
%"51" = load i32, ptr addrspace(5) %"42", align 4
|
||||
%"49" = call i32 @llvm.smax.i32(i32 %"50", i32 %"51")
|
||||
store i32 %"49", ptr addrspace(5) %"41", align 4
|
||||
%"52" = load i64, ptr addrspace(5) %"40", align 4
|
||||
%"53" = load i32, ptr addrspace(5) %"41", align 4
|
||||
%"56" = inttoptr i64 %"52" to ptr
|
||||
store i32 %"53", ptr %"56", align 4
|
||||
br label %"32"
|
||||
|
||||
"32": ; preds = %1
|
||||
%"39" = load i64, ptr addrspace(4) %"33", align 4
|
||||
store i64 %"39", ptr addrspace(5) %"35", align 4
|
||||
%"40" = load i64, ptr addrspace(4) %"34", align 4
|
||||
store i64 %"40", ptr addrspace(5) %"36", align 4
|
||||
%"42" = load i64, ptr addrspace(5) %"35", align 4
|
||||
%"50" = inttoptr i64 %"42" to ptr
|
||||
%"41" = load i32, ptr %"50", align 4
|
||||
store i32 %"41", ptr addrspace(5) %"37", align 4
|
||||
%"43" = load i64, ptr addrspace(5) %"35", align 4
|
||||
%"51" = inttoptr i64 %"43" to ptr
|
||||
%"31" = getelementptr inbounds i8, ptr %"51", i64 4
|
||||
%"44" = load i32, ptr %"31", align 4
|
||||
store i32 %"44", ptr addrspace(5) %"38", align 4
|
||||
%"46" = load i32, ptr addrspace(5) %"37", align 4
|
||||
%"47" = load i32, ptr addrspace(5) %"38", align 4
|
||||
%"45" = call i32 @llvm.smax.i32(i32 %"46", i32 %"47")
|
||||
store i32 %"45", ptr addrspace(5) %"37", align 4
|
||||
%"48" = load i64, ptr addrspace(5) %"36", align 4
|
||||
%"49" = load i32, ptr addrspace(5) %"37", align 4
|
||||
%"52" = inttoptr i64 %"48" to ptr
|
||||
store i32 %"49", ptr %"52", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
||||
declare i32 @llvm.smax.i32(i32, i32) #1
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
@ -1,36 +1,27 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @membar(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 {
|
||||
%"36" = alloca i64, align 8, addrspace(5)
|
||||
%"37" = alloca i64, align 8, addrspace(5)
|
||||
%"38" = alloca i32, align 4, addrspace(5)
|
||||
define amdgpu_kernel void @membar(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
|
||||
%"32" = alloca i64, align 8, addrspace(5)
|
||||
%"33" = alloca i64, align 8, addrspace(5)
|
||||
%"34" = alloca i32, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"39" = load i64, ptr addrspace(4) %"34", align 4
|
||||
store i64 %"39", ptr addrspace(5) %"36", align 4
|
||||
%"40" = load i64, ptr addrspace(4) %"35", align 4
|
||||
store i64 %"40", ptr addrspace(5) %"37", align 4
|
||||
%"42" = load i64, ptr addrspace(5) %"36", align 4
|
||||
%"46" = inttoptr i64 %"42" to ptr
|
||||
%"45" = load i32, ptr %"46", align 4
|
||||
store i32 %"45", ptr addrspace(5) %"38", align 4
|
||||
br label %"29"
|
||||
|
||||
"29": ; preds = %1
|
||||
%"35" = load i64, ptr addrspace(4) %"30", align 4
|
||||
store i64 %"35", ptr addrspace(5) %"32", align 4
|
||||
%"36" = load i64, ptr addrspace(4) %"31", align 4
|
||||
store i64 %"36", ptr addrspace(5) %"33", align 4
|
||||
%"38" = load i64, ptr addrspace(5) %"32", align 4
|
||||
%"42" = inttoptr i64 %"38" to ptr
|
||||
%"41" = load i32, ptr %"42", align 4
|
||||
store i32 %"41", ptr addrspace(5) %"34", align 4
|
||||
fence seq_cst
|
||||
%"43" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"44" = load i32, ptr addrspace(5) %"38", align 4
|
||||
%"47" = inttoptr i64 %"43" to ptr
|
||||
store i32 %"44", ptr %"47", align 4
|
||||
%"39" = load i64, ptr addrspace(5) %"33", align 4
|
||||
%"40" = load i32, ptr addrspace(5) %"34", align 4
|
||||
%"43" = inttoptr i64 %"39" to ptr
|
||||
store i32 %"40", ptr %"43", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -1,49 +1,40 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @min(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 {
|
||||
%"39" = alloca i64, align 8, addrspace(5)
|
||||
%"40" = alloca i64, align 8, addrspace(5)
|
||||
%"41" = alloca i32, align 4, addrspace(5)
|
||||
%"42" = alloca i32, align 4, addrspace(5)
|
||||
define amdgpu_kernel void @min(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 {
|
||||
%"35" = alloca i64, align 8, addrspace(5)
|
||||
%"36" = alloca i64, align 8, addrspace(5)
|
||||
%"37" = alloca i32, align 4, addrspace(5)
|
||||
%"38" = alloca i32, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"43" = load i64, ptr addrspace(4) %"37", align 4
|
||||
store i64 %"43", ptr addrspace(5) %"39", align 4
|
||||
%"44" = load i64, ptr addrspace(4) %"38", align 4
|
||||
store i64 %"44", ptr addrspace(5) %"40", align 4
|
||||
%"46" = load i64, ptr addrspace(5) %"39", align 4
|
||||
%"54" = inttoptr i64 %"46" to ptr
|
||||
%"45" = load i32, ptr %"54", align 4
|
||||
store i32 %"45", ptr addrspace(5) %"41", align 4
|
||||
%"47" = load i64, ptr addrspace(5) %"39", align 4
|
||||
%"55" = inttoptr i64 %"47" to ptr
|
||||
%"30" = getelementptr inbounds i8, ptr %"55", i64 4
|
||||
%"48" = load i32, ptr %"30", align 4
|
||||
store i32 %"48", ptr addrspace(5) %"42", align 4
|
||||
%"50" = load i32, ptr addrspace(5) %"41", align 4
|
||||
%"51" = load i32, ptr addrspace(5) %"42", align 4
|
||||
%"49" = call i32 @llvm.smin.i32(i32 %"50", i32 %"51")
|
||||
store i32 %"49", ptr addrspace(5) %"41", align 4
|
||||
%"52" = load i64, ptr addrspace(5) %"40", align 4
|
||||
%"53" = load i32, ptr addrspace(5) %"41", align 4
|
||||
%"56" = inttoptr i64 %"52" to ptr
|
||||
store i32 %"53", ptr %"56", align 4
|
||||
br label %"32"
|
||||
|
||||
"32": ; preds = %1
|
||||
%"39" = load i64, ptr addrspace(4) %"33", align 4
|
||||
store i64 %"39", ptr addrspace(5) %"35", align 4
|
||||
%"40" = load i64, ptr addrspace(4) %"34", align 4
|
||||
store i64 %"40", ptr addrspace(5) %"36", align 4
|
||||
%"42" = load i64, ptr addrspace(5) %"35", align 4
|
||||
%"50" = inttoptr i64 %"42" to ptr
|
||||
%"41" = load i32, ptr %"50", align 4
|
||||
store i32 %"41", ptr addrspace(5) %"37", align 4
|
||||
%"43" = load i64, ptr addrspace(5) %"35", align 4
|
||||
%"51" = inttoptr i64 %"43" to ptr
|
||||
%"31" = getelementptr inbounds i8, ptr %"51", i64 4
|
||||
%"44" = load i32, ptr %"31", align 4
|
||||
store i32 %"44", ptr addrspace(5) %"38", align 4
|
||||
%"46" = load i32, ptr addrspace(5) %"37", align 4
|
||||
%"47" = load i32, ptr addrspace(5) %"38", align 4
|
||||
%"45" = call i32 @llvm.smin.i32(i32 %"46", i32 %"47")
|
||||
store i32 %"45", ptr addrspace(5) %"37", align 4
|
||||
%"48" = load i64, ptr addrspace(5) %"36", align 4
|
||||
%"49" = load i32, ptr addrspace(5) %"37", align 4
|
||||
%"52" = inttoptr i64 %"48" to ptr
|
||||
store i32 %"49", ptr %"52", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
||||
declare i32 @llvm.smin.i32(i32, i32) #1
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
@ -1,38 +1,29 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @mov(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 {
|
||||
%"37" = alloca i64, align 8, addrspace(5)
|
||||
%"38" = alloca i64, align 8, addrspace(5)
|
||||
%"39" = alloca i64, align 8, addrspace(5)
|
||||
%"40" = alloca i64, align 8, addrspace(5)
|
||||
define amdgpu_kernel void @mov(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
|
||||
%"33" = alloca i64, align 8, addrspace(5)
|
||||
%"34" = alloca i64, align 8, addrspace(5)
|
||||
%"35" = alloca i64, align 8, addrspace(5)
|
||||
%"36" = alloca i64, align 8, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"41" = load i64, ptr addrspace(4) %"35", align 4
|
||||
store i64 %"41", ptr addrspace(5) %"37", align 4
|
||||
%"42" = load i64, ptr addrspace(4) %"36", align 4
|
||||
store i64 %"42", ptr addrspace(5) %"38", align 4
|
||||
%"44" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"49" = inttoptr i64 %"44" to ptr
|
||||
%"43" = load i64, ptr %"49", align 4
|
||||
store i64 %"43", ptr addrspace(5) %"39", align 4
|
||||
%"46" = load i64, ptr addrspace(5) %"39", align 4
|
||||
store i64 %"46", ptr addrspace(5) %"40", align 4
|
||||
%"47" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"48" = load i64, ptr addrspace(5) %"40", align 4
|
||||
%"50" = inttoptr i64 %"47" to ptr
|
||||
store i64 %"48", ptr %"50", align 4
|
||||
br label %"30"
|
||||
|
||||
"30": ; preds = %1
|
||||
%"37" = load i64, ptr addrspace(4) %"31", align 4
|
||||
store i64 %"37", ptr addrspace(5) %"33", align 4
|
||||
%"38" = load i64, ptr addrspace(4) %"32", align 4
|
||||
store i64 %"38", ptr addrspace(5) %"34", align 4
|
||||
%"40" = load i64, ptr addrspace(5) %"33", align 4
|
||||
%"45" = inttoptr i64 %"40" to ptr
|
||||
%"39" = load i64, ptr %"45", align 4
|
||||
store i64 %"39", ptr addrspace(5) %"35", align 4
|
||||
%"42" = load i64, ptr addrspace(5) %"35", align 4
|
||||
store i64 %"42", ptr addrspace(5) %"36", align 4
|
||||
%"43" = load i64, ptr addrspace(5) %"34", align 4
|
||||
%"44" = load i64, ptr addrspace(5) %"36", align 4
|
||||
%"46" = inttoptr i64 %"43" to ptr
|
||||
store i64 %"44", ptr %"46", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -1,24 +1,15 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @mov_address(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 {
|
||||
%"9" = alloca [8 x i8], align 1, addrspace(5)
|
||||
%"35" = alloca i64, align 8, addrspace(5)
|
||||
define amdgpu_kernel void @mov_address(ptr addrspace(4) byref(i64) %"29", ptr addrspace(4) byref(i64) %"30") #0 {
|
||||
%"10" = alloca [8 x i8], align 1, addrspace(5)
|
||||
%"31" = alloca i64, align 8, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"37" = ptrtoint ptr addrspace(5) %"9" to i64
|
||||
store i64 %"37", ptr addrspace(5) %"35", align 4
|
||||
br label %"28"
|
||||
|
||||
"28": ; preds = %1
|
||||
%"33" = ptrtoint ptr addrspace(5) %"10" to i64
|
||||
store i64 %"33", ptr addrspace(5) %"31", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -1,43 +1,34 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @mul24(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 {
|
||||
%"38" = alloca i64, align 8, addrspace(5)
|
||||
%"39" = alloca i64, align 8, addrspace(5)
|
||||
%"40" = alloca i32, align 4, addrspace(5)
|
||||
%"41" = alloca i32, align 4, addrspace(5)
|
||||
define amdgpu_kernel void @mul24(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) byref(i64) %"33") #0 {
|
||||
%"34" = alloca i64, align 8, addrspace(5)
|
||||
%"35" = alloca i64, align 8, addrspace(5)
|
||||
%"36" = alloca i32, align 4, addrspace(5)
|
||||
%"37" = alloca i32, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"42" = load i64, ptr addrspace(4) %"36", align 4
|
||||
store i64 %"42", ptr addrspace(5) %"38", align 4
|
||||
%"43" = load i64, ptr addrspace(4) %"37", align 4
|
||||
store i64 %"43", ptr addrspace(5) %"39", align 4
|
||||
%"45" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"50" = inttoptr i64 %"45" to ptr
|
||||
%"44" = load i32, ptr %"50", align 4
|
||||
store i32 %"44", ptr addrspace(5) %"40", align 4
|
||||
%"47" = load i32, ptr addrspace(5) %"40", align 4
|
||||
%"46" = call i32 @llvm.amdgcn.mul.u24(i32 %"47", i32 2)
|
||||
store i32 %"46", ptr addrspace(5) %"41", align 4
|
||||
%"48" = load i64, ptr addrspace(5) %"39", align 4
|
||||
%"49" = load i32, ptr addrspace(5) %"41", align 4
|
||||
%"51" = inttoptr i64 %"48" to ptr
|
||||
store i32 %"49", ptr %"51", align 4
|
||||
br label %"31"
|
||||
|
||||
"31": ; preds = %1
|
||||
%"38" = load i64, ptr addrspace(4) %"32", align 4
|
||||
store i64 %"38", ptr addrspace(5) %"34", align 4
|
||||
%"39" = load i64, ptr addrspace(4) %"33", align 4
|
||||
store i64 %"39", ptr addrspace(5) %"35", align 4
|
||||
%"41" = load i64, ptr addrspace(5) %"34", align 4
|
||||
%"46" = inttoptr i64 %"41" to ptr
|
||||
%"40" = load i32, ptr %"46", align 4
|
||||
store i32 %"40", ptr addrspace(5) %"36", align 4
|
||||
%"43" = load i32, ptr addrspace(5) %"36", align 4
|
||||
%"42" = call i32 @llvm.amdgcn.mul.u24(i32 %"43", i32 2)
|
||||
store i32 %"42", ptr addrspace(5) %"37", align 4
|
||||
%"44" = load i64, ptr addrspace(5) %"35", align 4
|
||||
%"45" = load i32, ptr addrspace(5) %"37", align 4
|
||||
%"47" = inttoptr i64 %"44" to ptr
|
||||
store i32 %"45", ptr %"47", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
||||
declare i32 @llvm.amdgcn.mul.u24(i32, i32) #1
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
@ -1,45 +1,36 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @mul_ftz(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 {
|
||||
%"39" = alloca i64, align 8, addrspace(5)
|
||||
%"40" = alloca i64, align 8, addrspace(5)
|
||||
%"41" = alloca float, align 4, addrspace(5)
|
||||
%"42" = alloca float, align 4, addrspace(5)
|
||||
define amdgpu_kernel void @mul_ftz(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 {
|
||||
%"35" = alloca i64, align 8, addrspace(5)
|
||||
%"36" = alloca i64, align 8, addrspace(5)
|
||||
%"37" = alloca float, align 4, addrspace(5)
|
||||
%"38" = alloca float, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"43" = load i64, ptr addrspace(4) %"37", align 4
|
||||
store i64 %"43", ptr addrspace(5) %"39", align 4
|
||||
%"44" = load i64, ptr addrspace(4) %"38", align 4
|
||||
store i64 %"44", ptr addrspace(5) %"40", align 4
|
||||
%"46" = load i64, ptr addrspace(5) %"39", align 4
|
||||
%"54" = inttoptr i64 %"46" to ptr
|
||||
%"45" = load float, ptr %"54", align 4
|
||||
store float %"45", ptr addrspace(5) %"41", align 4
|
||||
%"47" = load i64, ptr addrspace(5) %"39", align 4
|
||||
%"55" = inttoptr i64 %"47" to ptr
|
||||
%"30" = getelementptr inbounds i8, ptr %"55", i64 4
|
||||
%"48" = load float, ptr %"30", align 4
|
||||
store float %"48", ptr addrspace(5) %"42", align 4
|
||||
%"50" = load float, ptr addrspace(5) %"41", align 4
|
||||
%"51" = load float, ptr addrspace(5) %"42", align 4
|
||||
%"49" = fmul float %"50", %"51"
|
||||
store float %"49", ptr addrspace(5) %"41", align 4
|
||||
%"52" = load i64, ptr addrspace(5) %"40", align 4
|
||||
%"53" = load float, ptr addrspace(5) %"41", align 4
|
||||
%"56" = inttoptr i64 %"52" to ptr
|
||||
store float %"53", ptr %"56", align 4
|
||||
br label %"32"
|
||||
|
||||
"32": ; preds = %1
|
||||
%"39" = load i64, ptr addrspace(4) %"33", align 4
|
||||
store i64 %"39", ptr addrspace(5) %"35", align 4
|
||||
%"40" = load i64, ptr addrspace(4) %"34", align 4
|
||||
store i64 %"40", ptr addrspace(5) %"36", align 4
|
||||
%"42" = load i64, ptr addrspace(5) %"35", align 4
|
||||
%"50" = inttoptr i64 %"42" to ptr
|
||||
%"41" = load float, ptr %"50", align 4
|
||||
store float %"41", ptr addrspace(5) %"37", align 4
|
||||
%"43" = load i64, ptr addrspace(5) %"35", align 4
|
||||
%"51" = inttoptr i64 %"43" to ptr
|
||||
%"31" = getelementptr inbounds i8, ptr %"51", i64 4
|
||||
%"44" = load float, ptr %"31", align 4
|
||||
store float %"44", ptr addrspace(5) %"38", align 4
|
||||
%"46" = load float, ptr addrspace(5) %"37", align 4
|
||||
%"47" = load float, ptr addrspace(5) %"38", align 4
|
||||
%"45" = fmul float %"46", %"47"
|
||||
store float %"45", ptr addrspace(5) %"37", align 4
|
||||
%"48" = load i64, ptr addrspace(5) %"36", align 4
|
||||
%"49" = load float, ptr addrspace(5) %"37", align 4
|
||||
%"52" = inttoptr i64 %"48" to ptr
|
||||
store float %"49", ptr %"52", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -1,42 +1,33 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @mul_hi(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 {
|
||||
%"38" = alloca i64, align 8, addrspace(5)
|
||||
%"39" = alloca i64, align 8, addrspace(5)
|
||||
%"40" = alloca i64, align 8, addrspace(5)
|
||||
%"41" = alloca i64, align 8, addrspace(5)
|
||||
define amdgpu_kernel void @mul_hi(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) byref(i64) %"33") #0 {
|
||||
%"34" = alloca i64, align 8, addrspace(5)
|
||||
%"35" = alloca i64, align 8, addrspace(5)
|
||||
%"36" = alloca i64, align 8, addrspace(5)
|
||||
%"37" = alloca i64, align 8, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"42" = load i64, ptr addrspace(4) %"36", align 4
|
||||
store i64 %"42", ptr addrspace(5) %"38", align 4
|
||||
%"43" = load i64, ptr addrspace(4) %"37", align 4
|
||||
store i64 %"43", ptr addrspace(5) %"39", align 4
|
||||
%"45" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"50" = inttoptr i64 %"45" to ptr
|
||||
%"44" = load i64, ptr %"50", align 4
|
||||
store i64 %"44", ptr addrspace(5) %"40", align 4
|
||||
%"47" = load i64, ptr addrspace(5) %"40", align 4
|
||||
%2 = zext i64 %"47" to i128
|
||||
br label %"31"
|
||||
|
||||
"31": ; preds = %1
|
||||
%"38" = load i64, ptr addrspace(4) %"32", align 4
|
||||
store i64 %"38", ptr addrspace(5) %"34", align 4
|
||||
%"39" = load i64, ptr addrspace(4) %"33", align 4
|
||||
store i64 %"39", ptr addrspace(5) %"35", align 4
|
||||
%"41" = load i64, ptr addrspace(5) %"34", align 4
|
||||
%"46" = inttoptr i64 %"41" to ptr
|
||||
%"40" = load i64, ptr %"46", align 4
|
||||
store i64 %"40", ptr addrspace(5) %"36", align 4
|
||||
%"43" = load i64, ptr addrspace(5) %"36", align 4
|
||||
%2 = zext i64 %"43" to i128
|
||||
%3 = mul i128 %2, 2
|
||||
%4 = lshr i128 %3, 64
|
||||
%"46" = trunc i128 %4 to i64
|
||||
store i64 %"46", ptr addrspace(5) %"41", align 4
|
||||
%"48" = load i64, ptr addrspace(5) %"39", align 4
|
||||
%"49" = load i64, ptr addrspace(5) %"41", align 4
|
||||
%"51" = inttoptr i64 %"48" to ptr
|
||||
store i64 %"49", ptr %"51", align 4
|
||||
%"42" = trunc i128 %4 to i64
|
||||
store i64 %"42", ptr addrspace(5) %"37", align 4
|
||||
%"44" = load i64, ptr addrspace(5) %"35", align 4
|
||||
%"45" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"47" = inttoptr i64 %"44" to ptr
|
||||
store i64 %"45", ptr %"47", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -1,39 +1,30 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @mul_lo(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 {
|
||||
%"38" = alloca i64, align 8, addrspace(5)
|
||||
%"39" = alloca i64, align 8, addrspace(5)
|
||||
%"40" = alloca i64, align 8, addrspace(5)
|
||||
%"41" = alloca i64, align 8, addrspace(5)
|
||||
define amdgpu_kernel void @mul_lo(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) byref(i64) %"33") #0 {
|
||||
%"34" = alloca i64, align 8, addrspace(5)
|
||||
%"35" = alloca i64, align 8, addrspace(5)
|
||||
%"36" = alloca i64, align 8, addrspace(5)
|
||||
%"37" = alloca i64, align 8, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"42" = load i64, ptr addrspace(4) %"36", align 4
|
||||
store i64 %"42", ptr addrspace(5) %"38", align 4
|
||||
%"43" = load i64, ptr addrspace(4) %"37", align 4
|
||||
store i64 %"43", ptr addrspace(5) %"39", align 4
|
||||
%"45" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"50" = inttoptr i64 %"45" to ptr
|
||||
%"44" = load i64, ptr %"50", align 4
|
||||
store i64 %"44", ptr addrspace(5) %"40", align 4
|
||||
%"47" = load i64, ptr addrspace(5) %"40", align 4
|
||||
%"46" = mul i64 %"47", 2
|
||||
store i64 %"46", ptr addrspace(5) %"41", align 4
|
||||
%"48" = load i64, ptr addrspace(5) %"39", align 4
|
||||
%"49" = load i64, ptr addrspace(5) %"41", align 4
|
||||
%"51" = inttoptr i64 %"48" to ptr
|
||||
store i64 %"49", ptr %"51", align 4
|
||||
br label %"31"
|
||||
|
||||
"31": ; preds = %1
|
||||
%"38" = load i64, ptr addrspace(4) %"32", align 4
|
||||
store i64 %"38", ptr addrspace(5) %"34", align 4
|
||||
%"39" = load i64, ptr addrspace(4) %"33", align 4
|
||||
store i64 %"39", ptr addrspace(5) %"35", align 4
|
||||
%"41" = load i64, ptr addrspace(5) %"34", align 4
|
||||
%"46" = inttoptr i64 %"41" to ptr
|
||||
%"40" = load i64, ptr %"46", align 4
|
||||
store i64 %"40", ptr addrspace(5) %"36", align 4
|
||||
%"43" = load i64, ptr addrspace(5) %"36", align 4
|
||||
%"42" = mul i64 %"43", 2
|
||||
store i64 %"42", ptr addrspace(5) %"37", align 4
|
||||
%"44" = load i64, ptr addrspace(5) %"35", align 4
|
||||
%"45" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"47" = inttoptr i64 %"44" to ptr
|
||||
store i64 %"45", ptr %"47", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -1,45 +1,36 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @mul_non_ftz(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 {
|
||||
%"39" = alloca i64, align 8, addrspace(5)
|
||||
%"40" = alloca i64, align 8, addrspace(5)
|
||||
%"41" = alloca float, align 4, addrspace(5)
|
||||
%"42" = alloca float, align 4, addrspace(5)
|
||||
define amdgpu_kernel void @mul_non_ftz(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 {
|
||||
%"35" = alloca i64, align 8, addrspace(5)
|
||||
%"36" = alloca i64, align 8, addrspace(5)
|
||||
%"37" = alloca float, align 4, addrspace(5)
|
||||
%"38" = alloca float, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"43" = load i64, ptr addrspace(4) %"37", align 4
|
||||
store i64 %"43", ptr addrspace(5) %"39", align 4
|
||||
%"44" = load i64, ptr addrspace(4) %"38", align 4
|
||||
store i64 %"44", ptr addrspace(5) %"40", align 4
|
||||
%"46" = load i64, ptr addrspace(5) %"39", align 4
|
||||
%"54" = inttoptr i64 %"46" to ptr
|
||||
%"45" = load float, ptr %"54", align 4
|
||||
store float %"45", ptr addrspace(5) %"41", align 4
|
||||
%"47" = load i64, ptr addrspace(5) %"39", align 4
|
||||
%"55" = inttoptr i64 %"47" to ptr
|
||||
%"30" = getelementptr inbounds i8, ptr %"55", i64 4
|
||||
%"48" = load float, ptr %"30", align 4
|
||||
store float %"48", ptr addrspace(5) %"42", align 4
|
||||
%"50" = load float, ptr addrspace(5) %"41", align 4
|
||||
%"51" = load float, ptr addrspace(5) %"42", align 4
|
||||
%"49" = fmul float %"50", %"51"
|
||||
store float %"49", ptr addrspace(5) %"41", align 4
|
||||
%"52" = load i64, ptr addrspace(5) %"40", align 4
|
||||
%"53" = load float, ptr addrspace(5) %"41", align 4
|
||||
%"56" = inttoptr i64 %"52" to ptr
|
||||
store float %"53", ptr %"56", align 4
|
||||
br label %"32"
|
||||
|
||||
"32": ; preds = %1
|
||||
%"39" = load i64, ptr addrspace(4) %"33", align 4
|
||||
store i64 %"39", ptr addrspace(5) %"35", align 4
|
||||
%"40" = load i64, ptr addrspace(4) %"34", align 4
|
||||
store i64 %"40", ptr addrspace(5) %"36", align 4
|
||||
%"42" = load i64, ptr addrspace(5) %"35", align 4
|
||||
%"50" = inttoptr i64 %"42" to ptr
|
||||
%"41" = load float, ptr %"50", align 4
|
||||
store float %"41", ptr addrspace(5) %"37", align 4
|
||||
%"43" = load i64, ptr addrspace(5) %"35", align 4
|
||||
%"51" = inttoptr i64 %"43" to ptr
|
||||
%"31" = getelementptr inbounds i8, ptr %"51", i64 4
|
||||
%"44" = load float, ptr %"31", align 4
|
||||
store float %"44", ptr addrspace(5) %"38", align 4
|
||||
%"46" = load float, ptr addrspace(5) %"37", align 4
|
||||
%"47" = load float, ptr addrspace(5) %"38", align 4
|
||||
%"45" = fmul float %"46", %"47"
|
||||
store float %"45", ptr addrspace(5) %"37", align 4
|
||||
%"48" = load i64, ptr addrspace(5) %"36", align 4
|
||||
%"49" = load float, ptr addrspace(5) %"37", align 4
|
||||
%"52" = inttoptr i64 %"48" to ptr
|
||||
store float %"49", ptr %"52", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -1,48 +1,39 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @mul_wide(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #0 {
|
||||
define amdgpu_kernel void @mul_wide(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 {
|
||||
%"36" = alloca i64, align 8, addrspace(5)
|
||||
%"37" = alloca i64, align 8, addrspace(5)
|
||||
%"38" = alloca i32, align 4, addrspace(5)
|
||||
%"39" = alloca i32, align 4, addrspace(5)
|
||||
%"40" = alloca i64, align 8, addrspace(5)
|
||||
%"41" = alloca i64, align 8, addrspace(5)
|
||||
%"42" = alloca i32, align 4, addrspace(5)
|
||||
%"43" = alloca i32, align 4, addrspace(5)
|
||||
%"44" = alloca i64, align 8, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"45" = load i64, ptr addrspace(4) %"38", align 4
|
||||
store i64 %"45", ptr addrspace(5) %"40", align 4
|
||||
%"46" = load i64, ptr addrspace(4) %"39", align 4
|
||||
store i64 %"46", ptr addrspace(5) %"41", align 4
|
||||
%"48" = load i64, ptr addrspace(5) %"40", align 4
|
||||
%"56" = inttoptr i64 %"48" to ptr addrspace(1)
|
||||
%"47" = load i32, ptr addrspace(1) %"56", align 4
|
||||
store i32 %"47", ptr addrspace(5) %"42", align 4
|
||||
%"49" = load i64, ptr addrspace(5) %"40", align 4
|
||||
%"57" = inttoptr i64 %"49" to ptr addrspace(1)
|
||||
%"31" = getelementptr inbounds i8, ptr addrspace(1) %"57", i64 4
|
||||
%"50" = load i32, ptr addrspace(1) %"31", align 4
|
||||
store i32 %"50", ptr addrspace(5) %"43", align 4
|
||||
%"52" = load i32, ptr addrspace(5) %"42", align 4
|
||||
%"53" = load i32, ptr addrspace(5) %"43", align 4
|
||||
%2 = sext i32 %"52" to i64
|
||||
%3 = sext i32 %"53" to i64
|
||||
%"51" = mul i64 %2, %3
|
||||
store i64 %"51", ptr addrspace(5) %"44", align 4
|
||||
%"54" = load i64, ptr addrspace(5) %"41", align 4
|
||||
%"55" = load i64, ptr addrspace(5) %"44", align 4
|
||||
%"58" = inttoptr i64 %"54" to ptr
|
||||
store i64 %"55", ptr %"58", align 4
|
||||
br label %"33"
|
||||
|
||||
"33": ; preds = %1
|
||||
%"41" = load i64, ptr addrspace(4) %"34", align 4
|
||||
store i64 %"41", ptr addrspace(5) %"36", align 4
|
||||
%"42" = load i64, ptr addrspace(4) %"35", align 4
|
||||
store i64 %"42", ptr addrspace(5) %"37", align 4
|
||||
%"44" = load i64, ptr addrspace(5) %"36", align 4
|
||||
%"52" = inttoptr i64 %"44" to ptr addrspace(1)
|
||||
%"43" = load i32, ptr addrspace(1) %"52", align 4
|
||||
store i32 %"43", ptr addrspace(5) %"38", align 4
|
||||
%"45" = load i64, ptr addrspace(5) %"36", align 4
|
||||
%"53" = inttoptr i64 %"45" to ptr addrspace(1)
|
||||
%"32" = getelementptr inbounds i8, ptr addrspace(1) %"53", i64 4
|
||||
%"46" = load i32, ptr addrspace(1) %"32", align 4
|
||||
store i32 %"46", ptr addrspace(5) %"39", align 4
|
||||
%"48" = load i32, ptr addrspace(5) %"38", align 4
|
||||
%"49" = load i32, ptr addrspace(5) %"39", align 4
|
||||
%2 = sext i32 %"48" to i64
|
||||
%3 = sext i32 %"49" to i64
|
||||
%"47" = mul i64 %2, %3
|
||||
store i64 %"47", ptr addrspace(5) %"40", align 4
|
||||
%"50" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"51" = load i64, ptr addrspace(5) %"40", align 4
|
||||
%"54" = inttoptr i64 %"50" to ptr
|
||||
store i64 %"51", ptr %"54", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -1,38 +1,29 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @neg(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 {
|
||||
%"36" = alloca i64, align 8, addrspace(5)
|
||||
%"37" = alloca i64, align 8, addrspace(5)
|
||||
%"38" = alloca i32, align 4, addrspace(5)
|
||||
define amdgpu_kernel void @neg(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
|
||||
%"32" = alloca i64, align 8, addrspace(5)
|
||||
%"33" = alloca i64, align 8, addrspace(5)
|
||||
%"34" = alloca i32, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"39" = load i64, ptr addrspace(4) %"34", align 4
|
||||
store i64 %"39", ptr addrspace(5) %"36", align 4
|
||||
%"40" = load i64, ptr addrspace(4) %"35", align 4
|
||||
store i64 %"40", ptr addrspace(5) %"37", align 4
|
||||
%"42" = load i64, ptr addrspace(5) %"36", align 4
|
||||
%"47" = inttoptr i64 %"42" to ptr
|
||||
%"41" = load i32, ptr %"47", align 4
|
||||
store i32 %"41", ptr addrspace(5) %"38", align 4
|
||||
%"44" = load i32, ptr addrspace(5) %"38", align 4
|
||||
%"43" = sub i32 0, %"44"
|
||||
store i32 %"43", ptr addrspace(5) %"38", align 4
|
||||
%"45" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"46" = load i32, ptr addrspace(5) %"38", align 4
|
||||
%"48" = inttoptr i64 %"45" to ptr
|
||||
store i32 %"46", ptr %"48", align 4
|
||||
br label %"29"
|
||||
|
||||
"29": ; preds = %1
|
||||
%"35" = load i64, ptr addrspace(4) %"30", align 4
|
||||
store i64 %"35", ptr addrspace(5) %"32", align 4
|
||||
%"36" = load i64, ptr addrspace(4) %"31", align 4
|
||||
store i64 %"36", ptr addrspace(5) %"33", align 4
|
||||
%"38" = load i64, ptr addrspace(5) %"32", align 4
|
||||
%"43" = inttoptr i64 %"38" to ptr
|
||||
%"37" = load i32, ptr %"43", align 4
|
||||
store i32 %"37", ptr addrspace(5) %"34", align 4
|
||||
%"40" = load i32, ptr addrspace(5) %"34", align 4
|
||||
%"39" = sub i32 0, %"40"
|
||||
store i32 %"39", ptr addrspace(5) %"34", align 4
|
||||
%"41" = load i64, ptr addrspace(5) %"33", align 4
|
||||
%"42" = load i32, ptr addrspace(5) %"34", align 4
|
||||
%"44" = inttoptr i64 %"41" to ptr
|
||||
store i32 %"42", ptr %"44", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -1,44 +1,35 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @non_scalar_ptr_offset(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #0 {
|
||||
%"40" = alloca i64, align 8, addrspace(5)
|
||||
%"41" = alloca i64, align 8, addrspace(5)
|
||||
%"42" = alloca i32, align 4, addrspace(5)
|
||||
%"43" = alloca i32, align 4, addrspace(5)
|
||||
define amdgpu_kernel void @non_scalar_ptr_offset(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 {
|
||||
%"36" = alloca i64, align 8, addrspace(5)
|
||||
%"37" = alloca i64, align 8, addrspace(5)
|
||||
%"38" = alloca i32, align 4, addrspace(5)
|
||||
%"39" = alloca i32, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"44" = load i64, ptr addrspace(4) %"38", align 4
|
||||
store i64 %"44", ptr addrspace(5) %"40", align 4
|
||||
%"45" = load i64, ptr addrspace(4) %"39", align 4
|
||||
store i64 %"45", ptr addrspace(5) %"41", align 4
|
||||
%"46" = load i64, ptr addrspace(5) %"40", align 4
|
||||
%"54" = inttoptr i64 %"46" to ptr addrspace(1)
|
||||
%"31" = getelementptr inbounds i8, ptr addrspace(1) %"54", i64 8
|
||||
%"29" = load <2 x i32>, ptr addrspace(1) %"31", align 8
|
||||
%"47" = extractelement <2 x i32> %"29", i8 0
|
||||
%"48" = extractelement <2 x i32> %"29", i8 1
|
||||
store i32 %"47", ptr addrspace(5) %"42", align 4
|
||||
store i32 %"48", ptr addrspace(5) %"43", align 4
|
||||
%"50" = load i32, ptr addrspace(5) %"42", align 4
|
||||
%"51" = load i32, ptr addrspace(5) %"43", align 4
|
||||
%"49" = add i32 %"50", %"51"
|
||||
store i32 %"49", ptr addrspace(5) %"42", align 4
|
||||
%"52" = load i64, ptr addrspace(5) %"41", align 4
|
||||
%"53" = load i32, ptr addrspace(5) %"42", align 4
|
||||
%"55" = inttoptr i64 %"52" to ptr addrspace(1)
|
||||
store i32 %"53", ptr addrspace(1) %"55", align 4
|
||||
br label %"33"
|
||||
|
||||
"33": ; preds = %1
|
||||
%"40" = load i64, ptr addrspace(4) %"34", align 4
|
||||
store i64 %"40", ptr addrspace(5) %"36", align 4
|
||||
%"41" = load i64, ptr addrspace(4) %"35", align 4
|
||||
store i64 %"41", ptr addrspace(5) %"37", align 4
|
||||
%"42" = load i64, ptr addrspace(5) %"36", align 4
|
||||
%"50" = inttoptr i64 %"42" to ptr addrspace(1)
|
||||
%"32" = getelementptr inbounds i8, ptr addrspace(1) %"50", i64 8
|
||||
%"30" = load <2 x i32>, ptr addrspace(1) %"32", align 8
|
||||
%"43" = extractelement <2 x i32> %"30", i8 0
|
||||
%"44" = extractelement <2 x i32> %"30", i8 1
|
||||
store i32 %"43", ptr addrspace(5) %"38", align 4
|
||||
store i32 %"44", ptr addrspace(5) %"39", align 4
|
||||
%"46" = load i32, ptr addrspace(5) %"38", align 4
|
||||
%"47" = load i32, ptr addrspace(5) %"39", align 4
|
||||
%"45" = add i32 %"46", %"47"
|
||||
store i32 %"45", ptr addrspace(5) %"38", align 4
|
||||
%"48" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"49" = load i32, ptr addrspace(5) %"38", align 4
|
||||
%"51" = inttoptr i64 %"48" to ptr addrspace(1)
|
||||
store i32 %"49", ptr addrspace(1) %"51", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -1,39 +1,30 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @not(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 {
|
||||
%"37" = alloca i64, align 8, addrspace(5)
|
||||
%"38" = alloca i64, align 8, addrspace(5)
|
||||
%"39" = alloca i64, align 8, addrspace(5)
|
||||
%"40" = alloca i64, align 8, addrspace(5)
|
||||
define amdgpu_kernel void @not(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
|
||||
%"33" = alloca i64, align 8, addrspace(5)
|
||||
%"34" = alloca i64, align 8, addrspace(5)
|
||||
%"35" = alloca i64, align 8, addrspace(5)
|
||||
%"36" = alloca i64, align 8, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"41" = load i64, ptr addrspace(4) %"35", align 4
|
||||
store i64 %"41", ptr addrspace(5) %"37", align 4
|
||||
%"42" = load i64, ptr addrspace(4) %"36", align 4
|
||||
store i64 %"42", ptr addrspace(5) %"38", align 4
|
||||
%"44" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"49" = inttoptr i64 %"44" to ptr
|
||||
%"43" = load i64, ptr %"49", align 4
|
||||
store i64 %"43", ptr addrspace(5) %"39", align 4
|
||||
%"46" = load i64, ptr addrspace(5) %"39", align 4
|
||||
%"50" = xor i64 %"46", -1
|
||||
store i64 %"50", ptr addrspace(5) %"40", align 4
|
||||
%"47" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"48" = load i64, ptr addrspace(5) %"40", align 4
|
||||
%"52" = inttoptr i64 %"47" to ptr
|
||||
store i64 %"48", ptr %"52", align 4
|
||||
br label %"30"
|
||||
|
||||
"30": ; preds = %1
|
||||
%"37" = load i64, ptr addrspace(4) %"31", align 4
|
||||
store i64 %"37", ptr addrspace(5) %"33", align 4
|
||||
%"38" = load i64, ptr addrspace(4) %"32", align 4
|
||||
store i64 %"38", ptr addrspace(5) %"34", align 4
|
||||
%"40" = load i64, ptr addrspace(5) %"33", align 4
|
||||
%"45" = inttoptr i64 %"40" to ptr
|
||||
%"39" = load i64, ptr %"45", align 4
|
||||
store i64 %"39", ptr addrspace(5) %"35", align 4
|
||||
%"42" = load i64, ptr addrspace(5) %"35", align 4
|
||||
%"46" = xor i64 %"42", -1
|
||||
store i64 %"46", ptr addrspace(5) %"36", align 4
|
||||
%"43" = load i64, ptr addrspace(5) %"34", align 4
|
||||
%"44" = load i64, ptr addrspace(5) %"36", align 4
|
||||
%"48" = inttoptr i64 %"43" to ptr
|
||||
store i64 %"44", ptr %"48", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -1,42 +1,39 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @ntid(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 {
|
||||
%"39" = alloca i64, align 8, addrspace(5)
|
||||
%"40" = alloca i64, align 8, addrspace(5)
|
||||
%"41" = alloca i32, align 4, addrspace(5)
|
||||
%"42" = alloca i32, align 4, addrspace(5)
|
||||
define amdgpu_kernel void @ntid(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #1 {
|
||||
%"37" = alloca i64, align 8, addrspace(5)
|
||||
%"38" = alloca i64, align 8, addrspace(5)
|
||||
%"39" = alloca i32, align 4, addrspace(5)
|
||||
%"40" = alloca i32, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"43" = load i64, ptr addrspace(4) %"37", align 4
|
||||
store i64 %"43", ptr addrspace(5) %"39", align 4
|
||||
%"44" = load i64, ptr addrspace(4) %"38", align 4
|
||||
store i64 %"44", ptr addrspace(5) %"40", align 4
|
||||
%"46" = load i64, ptr addrspace(5) %"39", align 4
|
||||
%"53" = inttoptr i64 %"46" to ptr
|
||||
%"45" = load i32, ptr %"53", align 4
|
||||
store i32 %"45", ptr addrspace(5) %"41", align 4
|
||||
%"30" = call i32 @__zluda_ptx_impl_sreg_ntid(i8 0)
|
||||
store i32 %"30", ptr addrspace(5) %"42", align 4
|
||||
%"49" = load i32, ptr addrspace(5) %"41", align 4
|
||||
%"50" = load i32, ptr addrspace(5) %"42", align 4
|
||||
%"48" = add i32 %"49", %"50"
|
||||
store i32 %"48", ptr addrspace(5) %"41", align 4
|
||||
%"51" = load i64, ptr addrspace(5) %"40", align 4
|
||||
%"52" = load i32, ptr addrspace(5) %"41", align 4
|
||||
%"54" = inttoptr i64 %"51" to ptr
|
||||
store i32 %"52", ptr %"54", align 4
|
||||
br label %"32"
|
||||
|
||||
"32": ; preds = %1
|
||||
%"41" = load i64, ptr addrspace(4) %"35", align 4
|
||||
store i64 %"41", ptr addrspace(5) %"37", align 4
|
||||
%"42" = load i64, ptr addrspace(4) %"36", align 4
|
||||
store i64 %"42", ptr addrspace(5) %"38", align 4
|
||||
%"44" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"51" = inttoptr i64 %"44" to ptr
|
||||
%"43" = load i32, ptr %"51", align 4
|
||||
store i32 %"43", ptr addrspace(5) %"39", align 4
|
||||
%"31" = call i32 @__zluda_ptx_impl_sreg_ntid(i8 0)
|
||||
br label %"33"
|
||||
|
||||
"33": ; preds = %"32"
|
||||
store i32 %"31", ptr addrspace(5) %"40", align 4
|
||||
%"47" = load i32, ptr addrspace(5) %"39", align 4
|
||||
%"48" = load i32, ptr addrspace(5) %"40", align 4
|
||||
%"46" = add i32 %"47", %"48"
|
||||
store i32 %"46", ptr addrspace(5) %"39", align 4
|
||||
%"49" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"50" = load i32, ptr addrspace(5) %"39", align 4
|
||||
%"52" = inttoptr i64 %"49" to ptr
|
||||
store i32 %"50", ptr %"52", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -1,45 +1,36 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @or(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 {
|
||||
%"39" = alloca i64, align 8, addrspace(5)
|
||||
%"40" = alloca i64, align 8, addrspace(5)
|
||||
%"41" = alloca i64, align 8, addrspace(5)
|
||||
%"42" = alloca i64, align 8, addrspace(5)
|
||||
define amdgpu_kernel void @or(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 {
|
||||
%"35" = alloca i64, align 8, addrspace(5)
|
||||
%"36" = alloca i64, align 8, addrspace(5)
|
||||
%"37" = alloca i64, align 8, addrspace(5)
|
||||
%"38" = alloca i64, align 8, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"43" = load i64, ptr addrspace(4) %"37", align 4
|
||||
store i64 %"43", ptr addrspace(5) %"39", align 4
|
||||
%"44" = load i64, ptr addrspace(4) %"38", align 4
|
||||
store i64 %"44", ptr addrspace(5) %"40", align 4
|
||||
%"46" = load i64, ptr addrspace(5) %"39", align 4
|
||||
%"54" = inttoptr i64 %"46" to ptr
|
||||
%"45" = load i64, ptr %"54", align 4
|
||||
store i64 %"45", ptr addrspace(5) %"41", align 4
|
||||
%"47" = load i64, ptr addrspace(5) %"39", align 4
|
||||
%"55" = inttoptr i64 %"47" to ptr
|
||||
%"30" = getelementptr inbounds i8, ptr %"55", i64 8
|
||||
%"48" = load i64, ptr %"30", align 4
|
||||
store i64 %"48", ptr addrspace(5) %"42", align 4
|
||||
%"50" = load i64, ptr addrspace(5) %"41", align 4
|
||||
%"51" = load i64, ptr addrspace(5) %"42", align 4
|
||||
%"56" = or i64 %"50", %"51"
|
||||
store i64 %"56", ptr addrspace(5) %"41", align 4
|
||||
%"52" = load i64, ptr addrspace(5) %"40", align 4
|
||||
%"53" = load i64, ptr addrspace(5) %"41", align 4
|
||||
%"59" = inttoptr i64 %"52" to ptr
|
||||
store i64 %"53", ptr %"59", align 4
|
||||
br label %"32"
|
||||
|
||||
"32": ; preds = %1
|
||||
%"39" = load i64, ptr addrspace(4) %"33", align 4
|
||||
store i64 %"39", ptr addrspace(5) %"35", align 4
|
||||
%"40" = load i64, ptr addrspace(4) %"34", align 4
|
||||
store i64 %"40", ptr addrspace(5) %"36", align 4
|
||||
%"42" = load i64, ptr addrspace(5) %"35", align 4
|
||||
%"50" = inttoptr i64 %"42" to ptr
|
||||
%"41" = load i64, ptr %"50", align 4
|
||||
store i64 %"41", ptr addrspace(5) %"37", align 4
|
||||
%"43" = load i64, ptr addrspace(5) %"35", align 4
|
||||
%"51" = inttoptr i64 %"43" to ptr
|
||||
%"31" = getelementptr inbounds i8, ptr %"51", i64 8
|
||||
%"44" = load i64, ptr %"31", align 4
|
||||
store i64 %"44", ptr addrspace(5) %"38", align 4
|
||||
%"46" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"47" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"52" = or i64 %"46", %"47"
|
||||
store i64 %"52", ptr addrspace(5) %"37", align 4
|
||||
%"48" = load i64, ptr addrspace(5) %"36", align 4
|
||||
%"49" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"55" = inttoptr i64 %"48" to ptr
|
||||
store i64 %"49", ptr %"55", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -1,42 +1,33 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @popc(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 {
|
||||
%"36" = alloca i64, align 8, addrspace(5)
|
||||
%"37" = alloca i64, align 8, addrspace(5)
|
||||
%"38" = alloca i32, align 4, addrspace(5)
|
||||
define amdgpu_kernel void @popc(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
|
||||
%"32" = alloca i64, align 8, addrspace(5)
|
||||
%"33" = alloca i64, align 8, addrspace(5)
|
||||
%"34" = alloca i32, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"39" = load i64, ptr addrspace(4) %"34", align 4
|
||||
store i64 %"39", ptr addrspace(5) %"36", align 4
|
||||
%"40" = load i64, ptr addrspace(4) %"35", align 4
|
||||
store i64 %"40", ptr addrspace(5) %"37", align 4
|
||||
%"42" = load i64, ptr addrspace(5) %"36", align 4
|
||||
%"47" = inttoptr i64 %"42" to ptr
|
||||
%"41" = load i32, ptr %"47", align 4
|
||||
store i32 %"41", ptr addrspace(5) %"38", align 4
|
||||
%"44" = load i32, ptr addrspace(5) %"38", align 4
|
||||
%"48" = call i32 @llvm.ctpop.i32(i32 %"44")
|
||||
store i32 %"48", ptr addrspace(5) %"38", align 4
|
||||
%"45" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"46" = load i32, ptr addrspace(5) %"38", align 4
|
||||
%"49" = inttoptr i64 %"45" to ptr
|
||||
store i32 %"46", ptr %"49", align 4
|
||||
br label %"29"
|
||||
|
||||
"29": ; preds = %1
|
||||
%"35" = load i64, ptr addrspace(4) %"30", align 4
|
||||
store i64 %"35", ptr addrspace(5) %"32", align 4
|
||||
%"36" = load i64, ptr addrspace(4) %"31", align 4
|
||||
store i64 %"36", ptr addrspace(5) %"33", align 4
|
||||
%"38" = load i64, ptr addrspace(5) %"32", align 4
|
||||
%"43" = inttoptr i64 %"38" to ptr
|
||||
%"37" = load i32, ptr %"43", align 4
|
||||
store i32 %"37", ptr addrspace(5) %"34", align 4
|
||||
%"40" = load i32, ptr addrspace(5) %"34", align 4
|
||||
%"44" = call i32 @llvm.ctpop.i32(i32 %"40")
|
||||
store i32 %"44", ptr addrspace(5) %"34", align 4
|
||||
%"41" = load i64, ptr addrspace(5) %"33", align 4
|
||||
%"42" = load i32, ptr addrspace(5) %"34", align 4
|
||||
%"45" = inttoptr i64 %"41" to ptr
|
||||
store i32 %"42", ptr %"45", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
||||
declare i32 @llvm.ctpop.i32(i32) #1
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
@ -1,66 +1,57 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @pred_not(ptr addrspace(4) byref(i64) %"45", ptr addrspace(4) byref(i64) %"46") #0 {
|
||||
define amdgpu_kernel void @pred_not(ptr addrspace(4) byref(i64) %"41", ptr addrspace(4) byref(i64) %"42") #0 {
|
||||
%"43" = alloca i64, align 8, addrspace(5)
|
||||
%"44" = alloca i64, align 8, addrspace(5)
|
||||
%"45" = alloca i64, align 8, addrspace(5)
|
||||
%"46" = alloca i64, align 8, addrspace(5)
|
||||
%"47" = alloca i64, align 8, addrspace(5)
|
||||
%"48" = alloca i64, align 8, addrspace(5)
|
||||
%"49" = alloca i64, align 8, addrspace(5)
|
||||
%"50" = alloca i64, align 8, addrspace(5)
|
||||
%"51" = alloca i64, align 8, addrspace(5)
|
||||
%"52" = alloca i1, align 1, addrspace(5)
|
||||
%"48" = alloca i1, align 1, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"53" = load i64, ptr addrspace(4) %"45", align 4
|
||||
store i64 %"53", ptr addrspace(5) %"47", align 4
|
||||
%"54" = load i64, ptr addrspace(4) %"46", align 4
|
||||
store i64 %"54", ptr addrspace(5) %"48", align 4
|
||||
%"56" = load i64, ptr addrspace(5) %"47", align 4
|
||||
%"70" = inttoptr i64 %"56" to ptr
|
||||
%"55" = load i64, ptr %"70", align 4
|
||||
store i64 %"55", ptr addrspace(5) %"49", align 4
|
||||
%"57" = load i64, ptr addrspace(5) %"47", align 4
|
||||
%"71" = inttoptr i64 %"57" to ptr
|
||||
%"36" = getelementptr inbounds i8, ptr %"71", i64 8
|
||||
%"58" = load i64, ptr %"36", align 4
|
||||
store i64 %"58", ptr addrspace(5) %"50", align 4
|
||||
%"60" = load i64, ptr addrspace(5) %"49", align 4
|
||||
%"61" = load i64, ptr addrspace(5) %"50", align 4
|
||||
%"59" = icmp ult i64 %"60", %"61"
|
||||
store i1 %"59", ptr addrspace(5) %"52", align 1
|
||||
%"63" = load i1, ptr addrspace(5) %"52", align 1
|
||||
%"62" = xor i1 %"63", true
|
||||
store i1 %"62", ptr addrspace(5) %"52", align 1
|
||||
%"64" = load i1, ptr addrspace(5) %"52", align 1
|
||||
br i1 %"64", label %"15", label %"16"
|
||||
br label %"40"
|
||||
|
||||
"15": ; preds = %1
|
||||
store i64 1, ptr addrspace(5) %"51", align 4
|
||||
br label %"16"
|
||||
"40": ; preds = %1
|
||||
%"49" = load i64, ptr addrspace(4) %"41", align 4
|
||||
store i64 %"49", ptr addrspace(5) %"43", align 4
|
||||
%"50" = load i64, ptr addrspace(4) %"42", align 4
|
||||
store i64 %"50", ptr addrspace(5) %"44", align 4
|
||||
%"52" = load i64, ptr addrspace(5) %"43", align 4
|
||||
%"66" = inttoptr i64 %"52" to ptr
|
||||
%"51" = load i64, ptr %"66", align 4
|
||||
store i64 %"51", ptr addrspace(5) %"45", align 4
|
||||
%"53" = load i64, ptr addrspace(5) %"43", align 4
|
||||
%"67" = inttoptr i64 %"53" to ptr
|
||||
%"37" = getelementptr inbounds i8, ptr %"67", i64 8
|
||||
%"54" = load i64, ptr %"37", align 4
|
||||
store i64 %"54", ptr addrspace(5) %"46", align 4
|
||||
%"56" = load i64, ptr addrspace(5) %"45", align 4
|
||||
%"57" = load i64, ptr addrspace(5) %"46", align 4
|
||||
%"55" = icmp ult i64 %"56", %"57"
|
||||
store i1 %"55", ptr addrspace(5) %"48", align 1
|
||||
%"59" = load i1, ptr addrspace(5) %"48", align 1
|
||||
%"58" = xor i1 %"59", true
|
||||
store i1 %"58", ptr addrspace(5) %"48", align 1
|
||||
%"60" = load i1, ptr addrspace(5) %"48", align 1
|
||||
br i1 %"60", label %"16", label %"17"
|
||||
|
||||
"16": ; preds = %"15", %1
|
||||
%"66" = load i1, ptr addrspace(5) %"52", align 1
|
||||
br i1 %"66", label %"18", label %"17"
|
||||
"16": ; preds = %"40"
|
||||
store i64 1, ptr addrspace(5) %"47", align 4
|
||||
br label %"17"
|
||||
|
||||
"17": ; preds = %"16"
|
||||
store i64 2, ptr addrspace(5) %"51", align 4
|
||||
br label %"18"
|
||||
"17": ; preds = %"16", %"40"
|
||||
%"62" = load i1, ptr addrspace(5) %"48", align 1
|
||||
br i1 %"62", label %"19", label %"18"
|
||||
|
||||
"18": ; preds = %"17", %"16"
|
||||
%"68" = load i64, ptr addrspace(5) %"48", align 4
|
||||
%"69" = load i64, ptr addrspace(5) %"51", align 4
|
||||
%"72" = inttoptr i64 %"68" to ptr
|
||||
store i64 %"69", ptr %"72", align 4
|
||||
"18": ; preds = %"17"
|
||||
store i64 2, ptr addrspace(5) %"47", align 4
|
||||
br label %"19"
|
||||
|
||||
"19": ; preds = %"18", %"17"
|
||||
%"64" = load i64, ptr addrspace(5) %"44", align 4
|
||||
%"65" = load i64, ptr addrspace(5) %"47", align 4
|
||||
%"68" = inttoptr i64 %"64" to ptr
|
||||
store i64 %"65", ptr %"68", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -1,47 +1,38 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @prmt(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 {
|
||||
%"39" = alloca i64, align 8, addrspace(5)
|
||||
%"40" = alloca i64, align 8, addrspace(5)
|
||||
%"41" = alloca i32, align 4, addrspace(5)
|
||||
%"42" = alloca i32, align 4, addrspace(5)
|
||||
define amdgpu_kernel void @prmt(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 {
|
||||
%"35" = alloca i64, align 8, addrspace(5)
|
||||
%"36" = alloca i64, align 8, addrspace(5)
|
||||
%"37" = alloca i32, align 4, addrspace(5)
|
||||
%"38" = alloca i32, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"43" = load i64, ptr addrspace(4) %"37", align 4
|
||||
store i64 %"43", ptr addrspace(5) %"39", align 4
|
||||
%"44" = load i64, ptr addrspace(4) %"38", align 4
|
||||
store i64 %"44", ptr addrspace(5) %"40", align 4
|
||||
%"46" = load i64, ptr addrspace(5) %"39", align 4
|
||||
%"54" = inttoptr i64 %"46" to ptr
|
||||
%"45" = load i32, ptr %"54", align 4
|
||||
store i32 %"45", ptr addrspace(5) %"41", align 4
|
||||
%"47" = load i64, ptr addrspace(5) %"39", align 4
|
||||
%"55" = inttoptr i64 %"47" to ptr
|
||||
%"30" = getelementptr inbounds i8, ptr %"55", i64 4
|
||||
%"48" = load i32, ptr %"30", align 4
|
||||
store i32 %"48", ptr addrspace(5) %"42", align 4
|
||||
%"50" = load i32, ptr addrspace(5) %"41", align 4
|
||||
%"51" = load i32, ptr addrspace(5) %"42", align 4
|
||||
%2 = bitcast i32 %"50" to <4 x i8>
|
||||
%3 = bitcast i32 %"51" to <4 x i8>
|
||||
%"56" = shufflevector <4 x i8> %2, <4 x i8> %3, <4 x i32> <i32 4, i32 0, i32 6, i32 7>
|
||||
store <4 x i8> %"56", ptr addrspace(5) %"42", align 4
|
||||
%"52" = load i64, ptr addrspace(5) %"40", align 4
|
||||
%"53" = load i32, ptr addrspace(5) %"42", align 4
|
||||
%"59" = inttoptr i64 %"52" to ptr
|
||||
store i32 %"53", ptr %"59", align 4
|
||||
br label %"32"
|
||||
|
||||
"32": ; preds = %1
|
||||
%"39" = load i64, ptr addrspace(4) %"33", align 4
|
||||
store i64 %"39", ptr addrspace(5) %"35", align 4
|
||||
%"40" = load i64, ptr addrspace(4) %"34", align 4
|
||||
store i64 %"40", ptr addrspace(5) %"36", align 4
|
||||
%"42" = load i64, ptr addrspace(5) %"35", align 4
|
||||
%"50" = inttoptr i64 %"42" to ptr
|
||||
%"41" = load i32, ptr %"50", align 4
|
||||
store i32 %"41", ptr addrspace(5) %"37", align 4
|
||||
%"43" = load i64, ptr addrspace(5) %"35", align 4
|
||||
%"51" = inttoptr i64 %"43" to ptr
|
||||
%"31" = getelementptr inbounds i8, ptr %"51", i64 4
|
||||
%"44" = load i32, ptr %"31", align 4
|
||||
store i32 %"44", ptr addrspace(5) %"38", align 4
|
||||
%"46" = load i32, ptr addrspace(5) %"37", align 4
|
||||
%"47" = load i32, ptr addrspace(5) %"38", align 4
|
||||
%2 = bitcast i32 %"46" to <4 x i8>
|
||||
%3 = bitcast i32 %"47" to <4 x i8>
|
||||
%"52" = shufflevector <4 x i8> %2, <4 x i8> %3, <4 x i32> <i32 4, i32 0, i32 6, i32 7>
|
||||
store <4 x i8> %"52", ptr addrspace(5) %"38", align 4
|
||||
%"48" = load i64, ptr addrspace(5) %"36", align 4
|
||||
%"49" = load i32, ptr addrspace(5) %"38", align 4
|
||||
%"55" = inttoptr i64 %"48" to ptr
|
||||
store i32 %"49", ptr %"55", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -1,42 +1,33 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @rcp(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 {
|
||||
%"36" = alloca i64, align 8, addrspace(5)
|
||||
%"37" = alloca i64, align 8, addrspace(5)
|
||||
%"38" = alloca float, align 4, addrspace(5)
|
||||
define amdgpu_kernel void @rcp(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
|
||||
%"32" = alloca i64, align 8, addrspace(5)
|
||||
%"33" = alloca i64, align 8, addrspace(5)
|
||||
%"34" = alloca float, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"39" = load i64, ptr addrspace(4) %"34", align 4
|
||||
store i64 %"39", ptr addrspace(5) %"36", align 4
|
||||
%"40" = load i64, ptr addrspace(4) %"35", align 4
|
||||
store i64 %"40", ptr addrspace(5) %"37", align 4
|
||||
%"42" = load i64, ptr addrspace(5) %"36", align 4
|
||||
%"47" = inttoptr i64 %"42" to ptr
|
||||
%"41" = load float, ptr %"47", align 4
|
||||
store float %"41", ptr addrspace(5) %"38", align 4
|
||||
%"44" = load float, ptr addrspace(5) %"38", align 4
|
||||
%"43" = call float @llvm.amdgcn.rcp.f32(float %"44")
|
||||
store float %"43", ptr addrspace(5) %"38", align 4
|
||||
%"45" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"46" = load float, ptr addrspace(5) %"38", align 4
|
||||
%"48" = inttoptr i64 %"45" to ptr
|
||||
store float %"46", ptr %"48", align 4
|
||||
br label %"29"
|
||||
|
||||
"29": ; preds = %1
|
||||
%"35" = load i64, ptr addrspace(4) %"30", align 4
|
||||
store i64 %"35", ptr addrspace(5) %"32", align 4
|
||||
%"36" = load i64, ptr addrspace(4) %"31", align 4
|
||||
store i64 %"36", ptr addrspace(5) %"33", align 4
|
||||
%"38" = load i64, ptr addrspace(5) %"32", align 4
|
||||
%"43" = inttoptr i64 %"38" to ptr
|
||||
%"37" = load float, ptr %"43", align 4
|
||||
store float %"37", ptr addrspace(5) %"34", align 4
|
||||
%"40" = load float, ptr addrspace(5) %"34", align 4
|
||||
%"39" = call float @llvm.amdgcn.rcp.f32(float %"40")
|
||||
store float %"39", ptr addrspace(5) %"34", align 4
|
||||
%"41" = load i64, ptr addrspace(5) %"33", align 4
|
||||
%"42" = load float, ptr addrspace(5) %"34", align 4
|
||||
%"44" = inttoptr i64 %"41" to ptr
|
||||
store float %"42", ptr %"44", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
||||
declare float @llvm.amdgcn.rcp.f32(float) #1
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
@ -1,45 +1,36 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @reg_local(ptr addrspace(4) byref(i64) %"41", ptr addrspace(4) byref(i64) %"42") #0 {
|
||||
%"9" = alloca [8 x i8], align 8, addrspace(5)
|
||||
%"43" = alloca i64, align 8, addrspace(5)
|
||||
%"44" = alloca i64, align 8, addrspace(5)
|
||||
%"45" = alloca i64, align 8, addrspace(5)
|
||||
define amdgpu_kernel void @reg_local(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 {
|
||||
%"10" = alloca [8 x i8], align 8, addrspace(5)
|
||||
%"39" = alloca i64, align 8, addrspace(5)
|
||||
%"40" = alloca i64, align 8, addrspace(5)
|
||||
%"41" = alloca i64, align 8, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"46" = load i64, ptr addrspace(4) %"41", align 4
|
||||
store i64 %"46", ptr addrspace(5) %"43", align 4
|
||||
%"47" = load i64, ptr addrspace(4) %"42", align 4
|
||||
store i64 %"47", ptr addrspace(5) %"44", align 4
|
||||
%"49" = load i64, ptr addrspace(5) %"43", align 4
|
||||
%"55" = inttoptr i64 %"49" to ptr addrspace(1)
|
||||
%"54" = load i64, ptr addrspace(1) %"55", align 4
|
||||
store i64 %"54", ptr addrspace(5) %"45", align 4
|
||||
%"50" = load i64, ptr addrspace(5) %"45", align 4
|
||||
%"30" = add i64 %"50", 1
|
||||
%"56" = addrspacecast ptr addrspace(5) %"9" to ptr
|
||||
store i64 %"30", ptr %"56", align 4
|
||||
%"58" = addrspacecast ptr addrspace(5) %"9" to ptr
|
||||
%"32" = getelementptr inbounds i8, ptr %"58", i64 0
|
||||
%"59" = load i64, ptr %"32", align 4
|
||||
store i64 %"59", ptr addrspace(5) %"45", align 4
|
||||
%"52" = load i64, ptr addrspace(5) %"44", align 4
|
||||
%"60" = inttoptr i64 %"52" to ptr addrspace(1)
|
||||
%"34" = getelementptr inbounds i8, ptr addrspace(1) %"60", i64 0
|
||||
%"53" = load i64, ptr addrspace(5) %"45", align 4
|
||||
store i64 %"53", ptr addrspace(1) %"34", align 4
|
||||
br label %"36"
|
||||
|
||||
"36": ; preds = %1
|
||||
%"42" = load i64, ptr addrspace(4) %"37", align 4
|
||||
store i64 %"42", ptr addrspace(5) %"39", align 4
|
||||
%"43" = load i64, ptr addrspace(4) %"38", align 4
|
||||
store i64 %"43", ptr addrspace(5) %"40", align 4
|
||||
%"45" = load i64, ptr addrspace(5) %"39", align 4
|
||||
%"51" = inttoptr i64 %"45" to ptr addrspace(1)
|
||||
%"50" = load i64, ptr addrspace(1) %"51", align 4
|
||||
store i64 %"50", ptr addrspace(5) %"41", align 4
|
||||
%"46" = load i64, ptr addrspace(5) %"41", align 4
|
||||
%"31" = add i64 %"46", 1
|
||||
%"52" = addrspacecast ptr addrspace(5) %"10" to ptr
|
||||
store i64 %"31", ptr %"52", align 4
|
||||
%"54" = addrspacecast ptr addrspace(5) %"10" to ptr
|
||||
%"33" = getelementptr inbounds i8, ptr %"54", i64 0
|
||||
%"55" = load i64, ptr %"33", align 4
|
||||
store i64 %"55", ptr addrspace(5) %"41", align 4
|
||||
%"48" = load i64, ptr addrspace(5) %"40", align 4
|
||||
%"56" = inttoptr i64 %"48" to ptr addrspace(1)
|
||||
%"35" = getelementptr inbounds i8, ptr addrspace(1) %"56", i64 0
|
||||
%"49" = load i64, ptr addrspace(5) %"41", align 4
|
||||
store i64 %"49", ptr addrspace(1) %"35", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -1,45 +1,36 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @rem(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 {
|
||||
%"39" = alloca i64, align 8, addrspace(5)
|
||||
%"40" = alloca i64, align 8, addrspace(5)
|
||||
%"41" = alloca i32, align 4, addrspace(5)
|
||||
%"42" = alloca i32, align 4, addrspace(5)
|
||||
define amdgpu_kernel void @rem(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 {
|
||||
%"35" = alloca i64, align 8, addrspace(5)
|
||||
%"36" = alloca i64, align 8, addrspace(5)
|
||||
%"37" = alloca i32, align 4, addrspace(5)
|
||||
%"38" = alloca i32, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"43" = load i64, ptr addrspace(4) %"37", align 4
|
||||
store i64 %"43", ptr addrspace(5) %"39", align 4
|
||||
%"44" = load i64, ptr addrspace(4) %"38", align 4
|
||||
store i64 %"44", ptr addrspace(5) %"40", align 4
|
||||
%"46" = load i64, ptr addrspace(5) %"39", align 4
|
||||
%"54" = inttoptr i64 %"46" to ptr
|
||||
%"45" = load i32, ptr %"54", align 4
|
||||
store i32 %"45", ptr addrspace(5) %"41", align 4
|
||||
%"47" = load i64, ptr addrspace(5) %"39", align 4
|
||||
%"55" = inttoptr i64 %"47" to ptr
|
||||
%"30" = getelementptr inbounds i8, ptr %"55", i64 4
|
||||
%"48" = load i32, ptr %"30", align 4
|
||||
store i32 %"48", ptr addrspace(5) %"42", align 4
|
||||
%"50" = load i32, ptr addrspace(5) %"41", align 4
|
||||
%"51" = load i32, ptr addrspace(5) %"42", align 4
|
||||
%"49" = srem i32 %"50", %"51"
|
||||
store i32 %"49", ptr addrspace(5) %"41", align 4
|
||||
%"52" = load i64, ptr addrspace(5) %"40", align 4
|
||||
%"53" = load i32, ptr addrspace(5) %"41", align 4
|
||||
%"56" = inttoptr i64 %"52" to ptr
|
||||
store i32 %"53", ptr %"56", align 4
|
||||
br label %"32"
|
||||
|
||||
"32": ; preds = %1
|
||||
%"39" = load i64, ptr addrspace(4) %"33", align 4
|
||||
store i64 %"39", ptr addrspace(5) %"35", align 4
|
||||
%"40" = load i64, ptr addrspace(4) %"34", align 4
|
||||
store i64 %"40", ptr addrspace(5) %"36", align 4
|
||||
%"42" = load i64, ptr addrspace(5) %"35", align 4
|
||||
%"50" = inttoptr i64 %"42" to ptr
|
||||
%"41" = load i32, ptr %"50", align 4
|
||||
store i32 %"41", ptr addrspace(5) %"37", align 4
|
||||
%"43" = load i64, ptr addrspace(5) %"35", align 4
|
||||
%"51" = inttoptr i64 %"43" to ptr
|
||||
%"31" = getelementptr inbounds i8, ptr %"51", i64 4
|
||||
%"44" = load i32, ptr %"31", align 4
|
||||
store i32 %"44", ptr addrspace(5) %"38", align 4
|
||||
%"46" = load i32, ptr addrspace(5) %"37", align 4
|
||||
%"47" = load i32, ptr addrspace(5) %"38", align 4
|
||||
%"45" = srem i32 %"46", %"47"
|
||||
store i32 %"45", ptr addrspace(5) %"37", align 4
|
||||
%"48" = load i64, ptr addrspace(5) %"36", align 4
|
||||
%"49" = load i32, ptr addrspace(5) %"37", align 4
|
||||
%"52" = inttoptr i64 %"48" to ptr
|
||||
store i32 %"49", ptr %"52", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -1,42 +1,33 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @rsqrt(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 {
|
||||
%"36" = alloca i64, align 8, addrspace(5)
|
||||
%"37" = alloca i64, align 8, addrspace(5)
|
||||
%"38" = alloca double, align 8, addrspace(5)
|
||||
define amdgpu_kernel void @rsqrt(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
|
||||
%"32" = alloca i64, align 8, addrspace(5)
|
||||
%"33" = alloca i64, align 8, addrspace(5)
|
||||
%"34" = alloca double, align 8, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"39" = load i64, ptr addrspace(4) %"34", align 4
|
||||
store i64 %"39", ptr addrspace(5) %"36", align 4
|
||||
%"40" = load i64, ptr addrspace(4) %"35", align 4
|
||||
store i64 %"40", ptr addrspace(5) %"37", align 4
|
||||
%"42" = load i64, ptr addrspace(5) %"36", align 4
|
||||
%"47" = inttoptr i64 %"42" to ptr
|
||||
%"41" = load double, ptr %"47", align 8
|
||||
store double %"41", ptr addrspace(5) %"38", align 8
|
||||
%"44" = load double, ptr addrspace(5) %"38", align 8
|
||||
%"43" = call double @llvm.amdgcn.rsq.f64(double %"44")
|
||||
store double %"43", ptr addrspace(5) %"38", align 8
|
||||
%"45" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"46" = load double, ptr addrspace(5) %"38", align 8
|
||||
%"48" = inttoptr i64 %"45" to ptr
|
||||
store double %"46", ptr %"48", align 8
|
||||
br label %"29"
|
||||
|
||||
"29": ; preds = %1
|
||||
%"35" = load i64, ptr addrspace(4) %"30", align 4
|
||||
store i64 %"35", ptr addrspace(5) %"32", align 4
|
||||
%"36" = load i64, ptr addrspace(4) %"31", align 4
|
||||
store i64 %"36", ptr addrspace(5) %"33", align 4
|
||||
%"38" = load i64, ptr addrspace(5) %"32", align 4
|
||||
%"43" = inttoptr i64 %"38" to ptr
|
||||
%"37" = load double, ptr %"43", align 8
|
||||
store double %"37", ptr addrspace(5) %"34", align 8
|
||||
%"40" = load double, ptr addrspace(5) %"34", align 8
|
||||
%"39" = call double @llvm.amdgcn.rsq.f64(double %"40")
|
||||
store double %"39", ptr addrspace(5) %"34", align 8
|
||||
%"41" = load i64, ptr addrspace(5) %"33", align 4
|
||||
%"42" = load double, ptr addrspace(5) %"34", align 8
|
||||
%"44" = inttoptr i64 %"41" to ptr
|
||||
store double %"42", ptr %"44", align 8
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
||||
declare double @llvm.amdgcn.rsq.f64(double) #1
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
@ -1,45 +1,36 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @selp(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #0 {
|
||||
%"40" = alloca i64, align 8, addrspace(5)
|
||||
%"41" = alloca i64, align 8, addrspace(5)
|
||||
%"42" = alloca i16, align 2, addrspace(5)
|
||||
%"43" = alloca i16, align 2, addrspace(5)
|
||||
define amdgpu_kernel void @selp(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 {
|
||||
%"36" = alloca i64, align 8, addrspace(5)
|
||||
%"37" = alloca i64, align 8, addrspace(5)
|
||||
%"38" = alloca i16, align 2, addrspace(5)
|
||||
%"39" = alloca i16, align 2, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"44" = load i64, ptr addrspace(4) %"38", align 4
|
||||
store i64 %"44", ptr addrspace(5) %"40", align 4
|
||||
%"45" = load i64, ptr addrspace(4) %"39", align 4
|
||||
store i64 %"45", ptr addrspace(5) %"41", align 4
|
||||
%"47" = load i64, ptr addrspace(5) %"40", align 4
|
||||
%"55" = inttoptr i64 %"47" to ptr
|
||||
%"46" = load i16, ptr %"55", align 2
|
||||
store i16 %"46", ptr addrspace(5) %"42", align 2
|
||||
%"48" = load i64, ptr addrspace(5) %"40", align 4
|
||||
%"56" = inttoptr i64 %"48" to ptr
|
||||
%"30" = getelementptr inbounds i8, ptr %"56", i64 2
|
||||
%"49" = load i16, ptr %"30", align 2
|
||||
store i16 %"49", ptr addrspace(5) %"43", align 2
|
||||
%"51" = load i16, ptr addrspace(5) %"42", align 2
|
||||
%"52" = load i16, ptr addrspace(5) %"43", align 2
|
||||
%"50" = select i1 false, i16 %"51", i16 %"52"
|
||||
store i16 %"50", ptr addrspace(5) %"42", align 2
|
||||
%"53" = load i64, ptr addrspace(5) %"41", align 4
|
||||
%"54" = load i16, ptr addrspace(5) %"42", align 2
|
||||
%"57" = inttoptr i64 %"53" to ptr
|
||||
store i16 %"54", ptr %"57", align 2
|
||||
br label %"33"
|
||||
|
||||
"33": ; preds = %1
|
||||
%"40" = load i64, ptr addrspace(4) %"34", align 4
|
||||
store i64 %"40", ptr addrspace(5) %"36", align 4
|
||||
%"41" = load i64, ptr addrspace(4) %"35", align 4
|
||||
store i64 %"41", ptr addrspace(5) %"37", align 4
|
||||
%"43" = load i64, ptr addrspace(5) %"36", align 4
|
||||
%"51" = inttoptr i64 %"43" to ptr
|
||||
%"42" = load i16, ptr %"51", align 2
|
||||
store i16 %"42", ptr addrspace(5) %"38", align 2
|
||||
%"44" = load i64, ptr addrspace(5) %"36", align 4
|
||||
%"52" = inttoptr i64 %"44" to ptr
|
||||
%"31" = getelementptr inbounds i8, ptr %"52", i64 2
|
||||
%"45" = load i16, ptr %"31", align 2
|
||||
store i16 %"45", ptr addrspace(5) %"39", align 2
|
||||
%"47" = load i16, ptr addrspace(5) %"38", align 2
|
||||
%"48" = load i16, ptr addrspace(5) %"39", align 2
|
||||
%"46" = select i1 false, i16 %"47", i16 %"48"
|
||||
store i16 %"46", ptr addrspace(5) %"38", align 2
|
||||
%"49" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"50" = load i16, ptr addrspace(5) %"38", align 2
|
||||
%"53" = inttoptr i64 %"49" to ptr
|
||||
store i16 %"50", ptr %"53", align 2
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -1,45 +1,36 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @selp_true(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #0 {
|
||||
%"40" = alloca i64, align 8, addrspace(5)
|
||||
%"41" = alloca i64, align 8, addrspace(5)
|
||||
%"42" = alloca i16, align 2, addrspace(5)
|
||||
%"43" = alloca i16, align 2, addrspace(5)
|
||||
define amdgpu_kernel void @selp_true(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 {
|
||||
%"36" = alloca i64, align 8, addrspace(5)
|
||||
%"37" = alloca i64, align 8, addrspace(5)
|
||||
%"38" = alloca i16, align 2, addrspace(5)
|
||||
%"39" = alloca i16, align 2, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"44" = load i64, ptr addrspace(4) %"38", align 4
|
||||
store i64 %"44", ptr addrspace(5) %"40", align 4
|
||||
%"45" = load i64, ptr addrspace(4) %"39", align 4
|
||||
store i64 %"45", ptr addrspace(5) %"41", align 4
|
||||
%"47" = load i64, ptr addrspace(5) %"40", align 4
|
||||
%"55" = inttoptr i64 %"47" to ptr
|
||||
%"46" = load i16, ptr %"55", align 2
|
||||
store i16 %"46", ptr addrspace(5) %"42", align 2
|
||||
%"48" = load i64, ptr addrspace(5) %"40", align 4
|
||||
%"56" = inttoptr i64 %"48" to ptr
|
||||
%"30" = getelementptr inbounds i8, ptr %"56", i64 2
|
||||
%"49" = load i16, ptr %"30", align 2
|
||||
store i16 %"49", ptr addrspace(5) %"43", align 2
|
||||
%"51" = load i16, ptr addrspace(5) %"42", align 2
|
||||
%"52" = load i16, ptr addrspace(5) %"43", align 2
|
||||
%"50" = select i1 true, i16 %"51", i16 %"52"
|
||||
store i16 %"50", ptr addrspace(5) %"42", align 2
|
||||
%"53" = load i64, ptr addrspace(5) %"41", align 4
|
||||
%"54" = load i16, ptr addrspace(5) %"42", align 2
|
||||
%"57" = inttoptr i64 %"53" to ptr
|
||||
store i16 %"54", ptr %"57", align 2
|
||||
br label %"33"
|
||||
|
||||
"33": ; preds = %1
|
||||
%"40" = load i64, ptr addrspace(4) %"34", align 4
|
||||
store i64 %"40", ptr addrspace(5) %"36", align 4
|
||||
%"41" = load i64, ptr addrspace(4) %"35", align 4
|
||||
store i64 %"41", ptr addrspace(5) %"37", align 4
|
||||
%"43" = load i64, ptr addrspace(5) %"36", align 4
|
||||
%"51" = inttoptr i64 %"43" to ptr
|
||||
%"42" = load i16, ptr %"51", align 2
|
||||
store i16 %"42", ptr addrspace(5) %"38", align 2
|
||||
%"44" = load i64, ptr addrspace(5) %"36", align 4
|
||||
%"52" = inttoptr i64 %"44" to ptr
|
||||
%"31" = getelementptr inbounds i8, ptr %"52", i64 2
|
||||
%"45" = load i16, ptr %"31", align 2
|
||||
store i16 %"45", ptr addrspace(5) %"39", align 2
|
||||
%"47" = load i16, ptr addrspace(5) %"38", align 2
|
||||
%"48" = load i16, ptr addrspace(5) %"39", align 2
|
||||
%"46" = select i1 true, i16 %"47", i16 %"48"
|
||||
store i16 %"46", ptr addrspace(5) %"38", align 2
|
||||
%"49" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"50" = load i16, ptr addrspace(5) %"38", align 2
|
||||
%"53" = inttoptr i64 %"49" to ptr
|
||||
store i16 %"50", ptr %"53", align 2
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -1,63 +1,54 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @setp(ptr addrspace(4) byref(i64) %"45", ptr addrspace(4) byref(i64) %"46") #0 {
|
||||
define amdgpu_kernel void @setp(ptr addrspace(4) byref(i64) %"41", ptr addrspace(4) byref(i64) %"42") #0 {
|
||||
%"43" = alloca i64, align 8, addrspace(5)
|
||||
%"44" = alloca i64, align 8, addrspace(5)
|
||||
%"45" = alloca i64, align 8, addrspace(5)
|
||||
%"46" = alloca i64, align 8, addrspace(5)
|
||||
%"47" = alloca i64, align 8, addrspace(5)
|
||||
%"48" = alloca i64, align 8, addrspace(5)
|
||||
%"49" = alloca i64, align 8, addrspace(5)
|
||||
%"50" = alloca i64, align 8, addrspace(5)
|
||||
%"51" = alloca i64, align 8, addrspace(5)
|
||||
%"52" = alloca i1, align 1, addrspace(5)
|
||||
%"48" = alloca i1, align 1, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"53" = load i64, ptr addrspace(4) %"45", align 4
|
||||
store i64 %"53", ptr addrspace(5) %"47", align 4
|
||||
%"54" = load i64, ptr addrspace(4) %"46", align 4
|
||||
store i64 %"54", ptr addrspace(5) %"48", align 4
|
||||
%"56" = load i64, ptr addrspace(5) %"47", align 4
|
||||
%"68" = inttoptr i64 %"56" to ptr
|
||||
%"55" = load i64, ptr %"68", align 4
|
||||
store i64 %"55", ptr addrspace(5) %"49", align 4
|
||||
%"57" = load i64, ptr addrspace(5) %"47", align 4
|
||||
%"69" = inttoptr i64 %"57" to ptr
|
||||
%"36" = getelementptr inbounds i8, ptr %"69", i64 8
|
||||
%"58" = load i64, ptr %"36", align 4
|
||||
store i64 %"58", ptr addrspace(5) %"50", align 4
|
||||
%"60" = load i64, ptr addrspace(5) %"49", align 4
|
||||
%"61" = load i64, ptr addrspace(5) %"50", align 4
|
||||
%"59" = icmp ult i64 %"60", %"61"
|
||||
store i1 %"59", ptr addrspace(5) %"52", align 1
|
||||
%"62" = load i1, ptr addrspace(5) %"52", align 1
|
||||
br i1 %"62", label %"15", label %"16"
|
||||
br label %"40"
|
||||
|
||||
"15": ; preds = %1
|
||||
store i64 1, ptr addrspace(5) %"51", align 4
|
||||
br label %"16"
|
||||
"40": ; preds = %1
|
||||
%"49" = load i64, ptr addrspace(4) %"41", align 4
|
||||
store i64 %"49", ptr addrspace(5) %"43", align 4
|
||||
%"50" = load i64, ptr addrspace(4) %"42", align 4
|
||||
store i64 %"50", ptr addrspace(5) %"44", align 4
|
||||
%"52" = load i64, ptr addrspace(5) %"43", align 4
|
||||
%"64" = inttoptr i64 %"52" to ptr
|
||||
%"51" = load i64, ptr %"64", align 4
|
||||
store i64 %"51", ptr addrspace(5) %"45", align 4
|
||||
%"53" = load i64, ptr addrspace(5) %"43", align 4
|
||||
%"65" = inttoptr i64 %"53" to ptr
|
||||
%"37" = getelementptr inbounds i8, ptr %"65", i64 8
|
||||
%"54" = load i64, ptr %"37", align 4
|
||||
store i64 %"54", ptr addrspace(5) %"46", align 4
|
||||
%"56" = load i64, ptr addrspace(5) %"45", align 4
|
||||
%"57" = load i64, ptr addrspace(5) %"46", align 4
|
||||
%"55" = icmp ult i64 %"56", %"57"
|
||||
store i1 %"55", ptr addrspace(5) %"48", align 1
|
||||
%"58" = load i1, ptr addrspace(5) %"48", align 1
|
||||
br i1 %"58", label %"16", label %"17"
|
||||
|
||||
"16": ; preds = %"15", %1
|
||||
%"64" = load i1, ptr addrspace(5) %"52", align 1
|
||||
br i1 %"64", label %"18", label %"17"
|
||||
"16": ; preds = %"40"
|
||||
store i64 1, ptr addrspace(5) %"47", align 4
|
||||
br label %"17"
|
||||
|
||||
"17": ; preds = %"16"
|
||||
store i64 2, ptr addrspace(5) %"51", align 4
|
||||
br label %"18"
|
||||
"17": ; preds = %"16", %"40"
|
||||
%"60" = load i1, ptr addrspace(5) %"48", align 1
|
||||
br i1 %"60", label %"19", label %"18"
|
||||
|
||||
"18": ; preds = %"17", %"16"
|
||||
%"66" = load i64, ptr addrspace(5) %"48", align 4
|
||||
%"67" = load i64, ptr addrspace(5) %"51", align 4
|
||||
%"70" = inttoptr i64 %"66" to ptr
|
||||
store i64 %"67", ptr %"70", align 4
|
||||
"18": ; preds = %"17"
|
||||
store i64 2, ptr addrspace(5) %"47", align 4
|
||||
br label %"19"
|
||||
|
||||
"19": ; preds = %"18", %"17"
|
||||
%"62" = load i64, ptr addrspace(5) %"44", align 4
|
||||
%"63" = load i64, ptr addrspace(5) %"47", align 4
|
||||
%"66" = inttoptr i64 %"62" to ptr
|
||||
store i64 %"63", ptr %"66", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -1,65 +1,56 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @setp_gt(ptr addrspace(4) byref(i64) %"43", ptr addrspace(4) byref(i64) %"44") #0 {
|
||||
%"45" = alloca i64, align 8, addrspace(5)
|
||||
%"46" = alloca i64, align 8, addrspace(5)
|
||||
%"47" = alloca float, align 4, addrspace(5)
|
||||
%"48" = alloca float, align 4, addrspace(5)
|
||||
%"49" = alloca float, align 4, addrspace(5)
|
||||
%"50" = alloca i1, align 1, addrspace(5)
|
||||
define amdgpu_kernel void @setp_gt(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #0 {
|
||||
%"41" = alloca i64, align 8, addrspace(5)
|
||||
%"42" = alloca i64, align 8, addrspace(5)
|
||||
%"43" = alloca float, align 4, addrspace(5)
|
||||
%"44" = alloca float, align 4, addrspace(5)
|
||||
%"45" = alloca float, align 4, addrspace(5)
|
||||
%"46" = alloca i1, align 1, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"51" = load i64, ptr addrspace(4) %"43", align 4
|
||||
store i64 %"51", ptr addrspace(5) %"45", align 4
|
||||
%"52" = load i64, ptr addrspace(4) %"44", align 4
|
||||
store i64 %"52", ptr addrspace(5) %"46", align 4
|
||||
%"54" = load i64, ptr addrspace(5) %"45", align 4
|
||||
%"68" = inttoptr i64 %"54" to ptr
|
||||
%"53" = load float, ptr %"68", align 4
|
||||
store float %"53", ptr addrspace(5) %"47", align 4
|
||||
%"55" = load i64, ptr addrspace(5) %"45", align 4
|
||||
%"69" = inttoptr i64 %"55" to ptr
|
||||
%"36" = getelementptr inbounds i8, ptr %"69", i64 4
|
||||
%"56" = load float, ptr %"36", align 4
|
||||
store float %"56", ptr addrspace(5) %"48", align 4
|
||||
%"58" = load float, ptr addrspace(5) %"47", align 4
|
||||
%"59" = load float, ptr addrspace(5) %"48", align 4
|
||||
%"57" = fcmp ogt float %"58", %"59"
|
||||
store i1 %"57", ptr addrspace(5) %"50", align 1
|
||||
%"60" = load i1, ptr addrspace(5) %"50", align 1
|
||||
br i1 %"60", label %"15", label %"16"
|
||||
br label %"38"
|
||||
|
||||
"15": ; preds = %1
|
||||
%"62" = load float, ptr addrspace(5) %"47", align 4
|
||||
store float %"62", ptr addrspace(5) %"49", align 4
|
||||
br label %"16"
|
||||
"38": ; preds = %1
|
||||
%"47" = load i64, ptr addrspace(4) %"39", align 4
|
||||
store i64 %"47", ptr addrspace(5) %"41", align 4
|
||||
%"48" = load i64, ptr addrspace(4) %"40", align 4
|
||||
store i64 %"48", ptr addrspace(5) %"42", align 4
|
||||
%"50" = load i64, ptr addrspace(5) %"41", align 4
|
||||
%"64" = inttoptr i64 %"50" to ptr
|
||||
%"49" = load float, ptr %"64", align 4
|
||||
store float %"49", ptr addrspace(5) %"43", align 4
|
||||
%"51" = load i64, ptr addrspace(5) %"41", align 4
|
||||
%"65" = inttoptr i64 %"51" to ptr
|
||||
%"37" = getelementptr inbounds i8, ptr %"65", i64 4
|
||||
%"52" = load float, ptr %"37", align 4
|
||||
store float %"52", ptr addrspace(5) %"44", align 4
|
||||
%"54" = load float, ptr addrspace(5) %"43", align 4
|
||||
%"55" = load float, ptr addrspace(5) %"44", align 4
|
||||
%"53" = fcmp ogt float %"54", %"55"
|
||||
store i1 %"53", ptr addrspace(5) %"46", align 1
|
||||
%"56" = load i1, ptr addrspace(5) %"46", align 1
|
||||
br i1 %"56", label %"16", label %"17"
|
||||
|
||||
"16": ; preds = %"15", %1
|
||||
%"63" = load i1, ptr addrspace(5) %"50", align 1
|
||||
br i1 %"63", label %"18", label %"17"
|
||||
"16": ; preds = %"38"
|
||||
%"58" = load float, ptr addrspace(5) %"43", align 4
|
||||
store float %"58", ptr addrspace(5) %"45", align 4
|
||||
br label %"17"
|
||||
|
||||
"17": ; preds = %"16"
|
||||
%"65" = load float, ptr addrspace(5) %"48", align 4
|
||||
store float %"65", ptr addrspace(5) %"49", align 4
|
||||
br label %"18"
|
||||
"17": ; preds = %"16", %"38"
|
||||
%"59" = load i1, ptr addrspace(5) %"46", align 1
|
||||
br i1 %"59", label %"19", label %"18"
|
||||
|
||||
"18": ; preds = %"17", %"16"
|
||||
%"66" = load i64, ptr addrspace(5) %"46", align 4
|
||||
%"67" = load float, ptr addrspace(5) %"49", align 4
|
||||
%"70" = inttoptr i64 %"66" to ptr
|
||||
store float %"67", ptr %"70", align 4
|
||||
"18": ; preds = %"17"
|
||||
%"61" = load float, ptr addrspace(5) %"44", align 4
|
||||
store float %"61", ptr addrspace(5) %"45", align 4
|
||||
br label %"19"
|
||||
|
||||
"19": ; preds = %"18", %"17"
|
||||
%"62" = load i64, ptr addrspace(5) %"42", align 4
|
||||
%"63" = load float, ptr addrspace(5) %"45", align 4
|
||||
%"66" = inttoptr i64 %"62" to ptr
|
||||
store float %"63", ptr %"66", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -1,65 +1,56 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @setp_leu(ptr addrspace(4) byref(i64) %"43", ptr addrspace(4) byref(i64) %"44") #0 {
|
||||
%"45" = alloca i64, align 8, addrspace(5)
|
||||
%"46" = alloca i64, align 8, addrspace(5)
|
||||
%"47" = alloca float, align 4, addrspace(5)
|
||||
%"48" = alloca float, align 4, addrspace(5)
|
||||
%"49" = alloca float, align 4, addrspace(5)
|
||||
%"50" = alloca i1, align 1, addrspace(5)
|
||||
define amdgpu_kernel void @setp_leu(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #0 {
|
||||
%"41" = alloca i64, align 8, addrspace(5)
|
||||
%"42" = alloca i64, align 8, addrspace(5)
|
||||
%"43" = alloca float, align 4, addrspace(5)
|
||||
%"44" = alloca float, align 4, addrspace(5)
|
||||
%"45" = alloca float, align 4, addrspace(5)
|
||||
%"46" = alloca i1, align 1, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"51" = load i64, ptr addrspace(4) %"43", align 4
|
||||
store i64 %"51", ptr addrspace(5) %"45", align 4
|
||||
%"52" = load i64, ptr addrspace(4) %"44", align 4
|
||||
store i64 %"52", ptr addrspace(5) %"46", align 4
|
||||
%"54" = load i64, ptr addrspace(5) %"45", align 4
|
||||
%"68" = inttoptr i64 %"54" to ptr
|
||||
%"53" = load float, ptr %"68", align 4
|
||||
store float %"53", ptr addrspace(5) %"47", align 4
|
||||
%"55" = load i64, ptr addrspace(5) %"45", align 4
|
||||
%"69" = inttoptr i64 %"55" to ptr
|
||||
%"36" = getelementptr inbounds i8, ptr %"69", i64 4
|
||||
%"56" = load float, ptr %"36", align 4
|
||||
store float %"56", ptr addrspace(5) %"48", align 4
|
||||
%"58" = load float, ptr addrspace(5) %"47", align 4
|
||||
%"59" = load float, ptr addrspace(5) %"48", align 4
|
||||
%"57" = fcmp ule float %"58", %"59"
|
||||
store i1 %"57", ptr addrspace(5) %"50", align 1
|
||||
%"60" = load i1, ptr addrspace(5) %"50", align 1
|
||||
br i1 %"60", label %"15", label %"16"
|
||||
br label %"38"
|
||||
|
||||
"15": ; preds = %1
|
||||
%"62" = load float, ptr addrspace(5) %"47", align 4
|
||||
store float %"62", ptr addrspace(5) %"49", align 4
|
||||
br label %"16"
|
||||
"38": ; preds = %1
|
||||
%"47" = load i64, ptr addrspace(4) %"39", align 4
|
||||
store i64 %"47", ptr addrspace(5) %"41", align 4
|
||||
%"48" = load i64, ptr addrspace(4) %"40", align 4
|
||||
store i64 %"48", ptr addrspace(5) %"42", align 4
|
||||
%"50" = load i64, ptr addrspace(5) %"41", align 4
|
||||
%"64" = inttoptr i64 %"50" to ptr
|
||||
%"49" = load float, ptr %"64", align 4
|
||||
store float %"49", ptr addrspace(5) %"43", align 4
|
||||
%"51" = load i64, ptr addrspace(5) %"41", align 4
|
||||
%"65" = inttoptr i64 %"51" to ptr
|
||||
%"37" = getelementptr inbounds i8, ptr %"65", i64 4
|
||||
%"52" = load float, ptr %"37", align 4
|
||||
store float %"52", ptr addrspace(5) %"44", align 4
|
||||
%"54" = load float, ptr addrspace(5) %"43", align 4
|
||||
%"55" = load float, ptr addrspace(5) %"44", align 4
|
||||
%"53" = fcmp ule float %"54", %"55"
|
||||
store i1 %"53", ptr addrspace(5) %"46", align 1
|
||||
%"56" = load i1, ptr addrspace(5) %"46", align 1
|
||||
br i1 %"56", label %"16", label %"17"
|
||||
|
||||
"16": ; preds = %"15", %1
|
||||
%"63" = load i1, ptr addrspace(5) %"50", align 1
|
||||
br i1 %"63", label %"18", label %"17"
|
||||
"16": ; preds = %"38"
|
||||
%"58" = load float, ptr addrspace(5) %"43", align 4
|
||||
store float %"58", ptr addrspace(5) %"45", align 4
|
||||
br label %"17"
|
||||
|
||||
"17": ; preds = %"16"
|
||||
%"65" = load float, ptr addrspace(5) %"48", align 4
|
||||
store float %"65", ptr addrspace(5) %"49", align 4
|
||||
br label %"18"
|
||||
"17": ; preds = %"16", %"38"
|
||||
%"59" = load i1, ptr addrspace(5) %"46", align 1
|
||||
br i1 %"59", label %"19", label %"18"
|
||||
|
||||
"18": ; preds = %"17", %"16"
|
||||
%"66" = load i64, ptr addrspace(5) %"46", align 4
|
||||
%"67" = load float, ptr addrspace(5) %"49", align 4
|
||||
%"70" = inttoptr i64 %"66" to ptr
|
||||
store float %"67", ptr %"70", align 4
|
||||
"18": ; preds = %"17"
|
||||
%"61" = load float, ptr addrspace(5) %"44", align 4
|
||||
store float %"61", ptr addrspace(5) %"45", align 4
|
||||
br label %"19"
|
||||
|
||||
"19": ; preds = %"18", %"17"
|
||||
%"62" = load i64, ptr addrspace(5) %"42", align 4
|
||||
%"63" = load float, ptr addrspace(5) %"45", align 4
|
||||
%"66" = inttoptr i64 %"62" to ptr
|
||||
store float %"63", ptr %"66", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -1,174 +1,165 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @setp_nan(ptr addrspace(4) byref(i64) %"87", ptr addrspace(4) byref(i64) %"88") #0 {
|
||||
%"89" = alloca i64, align 8, addrspace(5)
|
||||
%"90" = alloca i64, align 8, addrspace(5)
|
||||
define amdgpu_kernel void @setp_nan(ptr addrspace(4) byref(i64) %"83", ptr addrspace(4) byref(i64) %"84") #0 {
|
||||
%"85" = alloca i64, align 8, addrspace(5)
|
||||
%"86" = alloca i64, align 8, addrspace(5)
|
||||
%"87" = alloca float, align 4, addrspace(5)
|
||||
%"88" = alloca float, align 4, addrspace(5)
|
||||
%"89" = alloca float, align 4, addrspace(5)
|
||||
%"90" = alloca float, align 4, addrspace(5)
|
||||
%"91" = alloca float, align 4, addrspace(5)
|
||||
%"92" = alloca float, align 4, addrspace(5)
|
||||
%"93" = alloca float, align 4, addrspace(5)
|
||||
%"94" = alloca float, align 4, addrspace(5)
|
||||
%"95" = alloca float, align 4, addrspace(5)
|
||||
%"96" = alloca float, align 4, addrspace(5)
|
||||
%"97" = alloca float, align 4, addrspace(5)
|
||||
%"98" = alloca float, align 4, addrspace(5)
|
||||
%"99" = alloca i32, align 4, addrspace(5)
|
||||
%"100" = alloca i1, align 1, addrspace(5)
|
||||
%"95" = alloca i32, align 4, addrspace(5)
|
||||
%"96" = alloca i1, align 1, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"101" = load i64, ptr addrspace(4) %"87", align 4
|
||||
store i64 %"101", ptr addrspace(5) %"89", align 4
|
||||
%"102" = load i64, ptr addrspace(4) %"88", align 4
|
||||
store i64 %"102", ptr addrspace(5) %"90", align 4
|
||||
%"104" = load i64, ptr addrspace(5) %"89", align 4
|
||||
%"155" = inttoptr i64 %"104" to ptr
|
||||
%"103" = load float, ptr %"155", align 4
|
||||
store float %"103", ptr addrspace(5) %"91", align 4
|
||||
%"105" = load i64, ptr addrspace(5) %"89", align 4
|
||||
%"156" = inttoptr i64 %"105" to ptr
|
||||
%"54" = getelementptr inbounds i8, ptr %"156", i64 4
|
||||
%"106" = load float, ptr %"54", align 4
|
||||
store float %"106", ptr addrspace(5) %"92", align 4
|
||||
%"107" = load i64, ptr addrspace(5) %"89", align 4
|
||||
%"157" = inttoptr i64 %"107" to ptr
|
||||
%"56" = getelementptr inbounds i8, ptr %"157", i64 8
|
||||
%"108" = load float, ptr %"56", align 4
|
||||
store float %"108", ptr addrspace(5) %"93", align 4
|
||||
%"109" = load i64, ptr addrspace(5) %"89", align 4
|
||||
%"158" = inttoptr i64 %"109" to ptr
|
||||
%"58" = getelementptr inbounds i8, ptr %"158", i64 12
|
||||
%"110" = load float, ptr %"58", align 4
|
||||
store float %"110", ptr addrspace(5) %"94", align 4
|
||||
%"111" = load i64, ptr addrspace(5) %"89", align 4
|
||||
%"159" = inttoptr i64 %"111" to ptr
|
||||
%"60" = getelementptr inbounds i8, ptr %"159", i64 16
|
||||
%"112" = load float, ptr %"60", align 4
|
||||
store float %"112", ptr addrspace(5) %"95", align 4
|
||||
%"113" = load i64, ptr addrspace(5) %"89", align 4
|
||||
%"160" = inttoptr i64 %"113" to ptr
|
||||
%"62" = getelementptr inbounds i8, ptr %"160", i64 20
|
||||
%"114" = load float, ptr %"62", align 4
|
||||
store float %"114", ptr addrspace(5) %"96", align 4
|
||||
%"115" = load i64, ptr addrspace(5) %"89", align 4
|
||||
%"161" = inttoptr i64 %"115" to ptr
|
||||
%"64" = getelementptr inbounds i8, ptr %"161", i64 24
|
||||
%"116" = load float, ptr %"64", align 4
|
||||
store float %"116", ptr addrspace(5) %"97", align 4
|
||||
%"117" = load i64, ptr addrspace(5) %"89", align 4
|
||||
%"162" = inttoptr i64 %"117" to ptr
|
||||
%"66" = getelementptr inbounds i8, ptr %"162", i64 28
|
||||
%"118" = load float, ptr %"66", align 4
|
||||
store float %"118", ptr addrspace(5) %"98", align 4
|
||||
%"120" = load float, ptr addrspace(5) %"91", align 4
|
||||
%"121" = load float, ptr addrspace(5) %"92", align 4
|
||||
%"119" = fcmp uno float %"120", %"121"
|
||||
store i1 %"119", ptr addrspace(5) %"100", align 1
|
||||
%"122" = load i1, ptr addrspace(5) %"100", align 1
|
||||
br i1 %"122", label %"21", label %"22"
|
||||
br label %"82"
|
||||
|
||||
"21": ; preds = %1
|
||||
store i32 1, ptr addrspace(5) %"99", align 4
|
||||
br label %"22"
|
||||
"82": ; preds = %1
|
||||
%"97" = load i64, ptr addrspace(4) %"83", align 4
|
||||
store i64 %"97", ptr addrspace(5) %"85", align 4
|
||||
%"98" = load i64, ptr addrspace(4) %"84", align 4
|
||||
store i64 %"98", ptr addrspace(5) %"86", align 4
|
||||
%"100" = load i64, ptr addrspace(5) %"85", align 4
|
||||
%"151" = inttoptr i64 %"100" to ptr
|
||||
%"99" = load float, ptr %"151", align 4
|
||||
store float %"99", ptr addrspace(5) %"87", align 4
|
||||
%"101" = load i64, ptr addrspace(5) %"85", align 4
|
||||
%"152" = inttoptr i64 %"101" to ptr
|
||||
%"55" = getelementptr inbounds i8, ptr %"152", i64 4
|
||||
%"102" = load float, ptr %"55", align 4
|
||||
store float %"102", ptr addrspace(5) %"88", align 4
|
||||
%"103" = load i64, ptr addrspace(5) %"85", align 4
|
||||
%"153" = inttoptr i64 %"103" to ptr
|
||||
%"57" = getelementptr inbounds i8, ptr %"153", i64 8
|
||||
%"104" = load float, ptr %"57", align 4
|
||||
store float %"104", ptr addrspace(5) %"89", align 4
|
||||
%"105" = load i64, ptr addrspace(5) %"85", align 4
|
||||
%"154" = inttoptr i64 %"105" to ptr
|
||||
%"59" = getelementptr inbounds i8, ptr %"154", i64 12
|
||||
%"106" = load float, ptr %"59", align 4
|
||||
store float %"106", ptr addrspace(5) %"90", align 4
|
||||
%"107" = load i64, ptr addrspace(5) %"85", align 4
|
||||
%"155" = inttoptr i64 %"107" to ptr
|
||||
%"61" = getelementptr inbounds i8, ptr %"155", i64 16
|
||||
%"108" = load float, ptr %"61", align 4
|
||||
store float %"108", ptr addrspace(5) %"91", align 4
|
||||
%"109" = load i64, ptr addrspace(5) %"85", align 4
|
||||
%"156" = inttoptr i64 %"109" to ptr
|
||||
%"63" = getelementptr inbounds i8, ptr %"156", i64 20
|
||||
%"110" = load float, ptr %"63", align 4
|
||||
store float %"110", ptr addrspace(5) %"92", align 4
|
||||
%"111" = load i64, ptr addrspace(5) %"85", align 4
|
||||
%"157" = inttoptr i64 %"111" to ptr
|
||||
%"65" = getelementptr inbounds i8, ptr %"157", i64 24
|
||||
%"112" = load float, ptr %"65", align 4
|
||||
store float %"112", ptr addrspace(5) %"93", align 4
|
||||
%"113" = load i64, ptr addrspace(5) %"85", align 4
|
||||
%"158" = inttoptr i64 %"113" to ptr
|
||||
%"67" = getelementptr inbounds i8, ptr %"158", i64 28
|
||||
%"114" = load float, ptr %"67", align 4
|
||||
store float %"114", ptr addrspace(5) %"94", align 4
|
||||
%"116" = load float, ptr addrspace(5) %"87", align 4
|
||||
%"117" = load float, ptr addrspace(5) %"88", align 4
|
||||
%"115" = fcmp uno float %"116", %"117"
|
||||
store i1 %"115", ptr addrspace(5) %"96", align 1
|
||||
%"118" = load i1, ptr addrspace(5) %"96", align 1
|
||||
br i1 %"118", label %"22", label %"23"
|
||||
|
||||
"22": ; preds = %"21", %1
|
||||
%"124" = load i1, ptr addrspace(5) %"100", align 1
|
||||
br i1 %"124", label %"24", label %"23"
|
||||
"22": ; preds = %"82"
|
||||
store i32 1, ptr addrspace(5) %"95", align 4
|
||||
br label %"23"
|
||||
|
||||
"23": ; preds = %"22"
|
||||
store i32 0, ptr addrspace(5) %"99", align 4
|
||||
br label %"24"
|
||||
"23": ; preds = %"22", %"82"
|
||||
%"120" = load i1, ptr addrspace(5) %"96", align 1
|
||||
br i1 %"120", label %"25", label %"24"
|
||||
|
||||
"24": ; preds = %"23", %"22"
|
||||
%"126" = load i64, ptr addrspace(5) %"90", align 4
|
||||
%"127" = load i32, ptr addrspace(5) %"99", align 4
|
||||
%"163" = inttoptr i64 %"126" to ptr
|
||||
store i32 %"127", ptr %"163", align 4
|
||||
%"129" = load float, ptr addrspace(5) %"93", align 4
|
||||
%"130" = load float, ptr addrspace(5) %"94", align 4
|
||||
%"128" = fcmp uno float %"129", %"130"
|
||||
store i1 %"128", ptr addrspace(5) %"100", align 1
|
||||
%"131" = load i1, ptr addrspace(5) %"100", align 1
|
||||
br i1 %"131", label %"25", label %"26"
|
||||
"24": ; preds = %"23"
|
||||
store i32 0, ptr addrspace(5) %"95", align 4
|
||||
br label %"25"
|
||||
|
||||
"25": ; preds = %"24"
|
||||
store i32 1, ptr addrspace(5) %"99", align 4
|
||||
br label %"26"
|
||||
"25": ; preds = %"24", %"23"
|
||||
%"122" = load i64, ptr addrspace(5) %"86", align 4
|
||||
%"123" = load i32, ptr addrspace(5) %"95", align 4
|
||||
%"159" = inttoptr i64 %"122" to ptr
|
||||
store i32 %"123", ptr %"159", align 4
|
||||
%"125" = load float, ptr addrspace(5) %"89", align 4
|
||||
%"126" = load float, ptr addrspace(5) %"90", align 4
|
||||
%"124" = fcmp uno float %"125", %"126"
|
||||
store i1 %"124", ptr addrspace(5) %"96", align 1
|
||||
%"127" = load i1, ptr addrspace(5) %"96", align 1
|
||||
br i1 %"127", label %"26", label %"27"
|
||||
|
||||
"26": ; preds = %"25", %"24"
|
||||
%"133" = load i1, ptr addrspace(5) %"100", align 1
|
||||
br i1 %"133", label %"28", label %"27"
|
||||
"26": ; preds = %"25"
|
||||
store i32 1, ptr addrspace(5) %"95", align 4
|
||||
br label %"27"
|
||||
|
||||
"27": ; preds = %"26"
|
||||
store i32 0, ptr addrspace(5) %"99", align 4
|
||||
br label %"28"
|
||||
"27": ; preds = %"26", %"25"
|
||||
%"129" = load i1, ptr addrspace(5) %"96", align 1
|
||||
br i1 %"129", label %"29", label %"28"
|
||||
|
||||
"28": ; preds = %"27", %"26"
|
||||
%"135" = load i64, ptr addrspace(5) %"90", align 4
|
||||
%"164" = inttoptr i64 %"135" to ptr
|
||||
%"72" = getelementptr inbounds i8, ptr %"164", i64 4
|
||||
%"136" = load i32, ptr addrspace(5) %"99", align 4
|
||||
store i32 %"136", ptr %"72", align 4
|
||||
%"138" = load float, ptr addrspace(5) %"95", align 4
|
||||
%"139" = load float, ptr addrspace(5) %"96", align 4
|
||||
%"137" = fcmp uno float %"138", %"139"
|
||||
store i1 %"137", ptr addrspace(5) %"100", align 1
|
||||
%"140" = load i1, ptr addrspace(5) %"100", align 1
|
||||
br i1 %"140", label %"29", label %"30"
|
||||
"28": ; preds = %"27"
|
||||
store i32 0, ptr addrspace(5) %"95", align 4
|
||||
br label %"29"
|
||||
|
||||
"29": ; preds = %"28"
|
||||
store i32 1, ptr addrspace(5) %"99", align 4
|
||||
br label %"30"
|
||||
"29": ; preds = %"28", %"27"
|
||||
%"131" = load i64, ptr addrspace(5) %"86", align 4
|
||||
%"160" = inttoptr i64 %"131" to ptr
|
||||
%"73" = getelementptr inbounds i8, ptr %"160", i64 4
|
||||
%"132" = load i32, ptr addrspace(5) %"95", align 4
|
||||
store i32 %"132", ptr %"73", align 4
|
||||
%"134" = load float, ptr addrspace(5) %"91", align 4
|
||||
%"135" = load float, ptr addrspace(5) %"92", align 4
|
||||
%"133" = fcmp uno float %"134", %"135"
|
||||
store i1 %"133", ptr addrspace(5) %"96", align 1
|
||||
%"136" = load i1, ptr addrspace(5) %"96", align 1
|
||||
br i1 %"136", label %"30", label %"31"
|
||||
|
||||
"30": ; preds = %"29", %"28"
|
||||
%"142" = load i1, ptr addrspace(5) %"100", align 1
|
||||
br i1 %"142", label %"32", label %"31"
|
||||
"30": ; preds = %"29"
|
||||
store i32 1, ptr addrspace(5) %"95", align 4
|
||||
br label %"31"
|
||||
|
||||
"31": ; preds = %"30"
|
||||
store i32 0, ptr addrspace(5) %"99", align 4
|
||||
br label %"32"
|
||||
"31": ; preds = %"30", %"29"
|
||||
%"138" = load i1, ptr addrspace(5) %"96", align 1
|
||||
br i1 %"138", label %"33", label %"32"
|
||||
|
||||
"32": ; preds = %"31", %"30"
|
||||
%"144" = load i64, ptr addrspace(5) %"90", align 4
|
||||
%"165" = inttoptr i64 %"144" to ptr
|
||||
%"76" = getelementptr inbounds i8, ptr %"165", i64 8
|
||||
%"145" = load i32, ptr addrspace(5) %"99", align 4
|
||||
store i32 %"145", ptr %"76", align 4
|
||||
%"147" = load float, ptr addrspace(5) %"97", align 4
|
||||
%"148" = load float, ptr addrspace(5) %"98", align 4
|
||||
%"146" = fcmp uno float %"147", %"148"
|
||||
store i1 %"146", ptr addrspace(5) %"100", align 1
|
||||
%"149" = load i1, ptr addrspace(5) %"100", align 1
|
||||
br i1 %"149", label %"33", label %"34"
|
||||
"32": ; preds = %"31"
|
||||
store i32 0, ptr addrspace(5) %"95", align 4
|
||||
br label %"33"
|
||||
|
||||
"33": ; preds = %"32"
|
||||
store i32 1, ptr addrspace(5) %"99", align 4
|
||||
br label %"34"
|
||||
"33": ; preds = %"32", %"31"
|
||||
%"140" = load i64, ptr addrspace(5) %"86", align 4
|
||||
%"161" = inttoptr i64 %"140" to ptr
|
||||
%"77" = getelementptr inbounds i8, ptr %"161", i64 8
|
||||
%"141" = load i32, ptr addrspace(5) %"95", align 4
|
||||
store i32 %"141", ptr %"77", align 4
|
||||
%"143" = load float, ptr addrspace(5) %"93", align 4
|
||||
%"144" = load float, ptr addrspace(5) %"94", align 4
|
||||
%"142" = fcmp uno float %"143", %"144"
|
||||
store i1 %"142", ptr addrspace(5) %"96", align 1
|
||||
%"145" = load i1, ptr addrspace(5) %"96", align 1
|
||||
br i1 %"145", label %"34", label %"35"
|
||||
|
||||
"34": ; preds = %"33", %"32"
|
||||
%"151" = load i1, ptr addrspace(5) %"100", align 1
|
||||
br i1 %"151", label %"36", label %"35"
|
||||
"34": ; preds = %"33"
|
||||
store i32 1, ptr addrspace(5) %"95", align 4
|
||||
br label %"35"
|
||||
|
||||
"35": ; preds = %"34"
|
||||
store i32 0, ptr addrspace(5) %"99", align 4
|
||||
br label %"36"
|
||||
"35": ; preds = %"34", %"33"
|
||||
%"147" = load i1, ptr addrspace(5) %"96", align 1
|
||||
br i1 %"147", label %"37", label %"36"
|
||||
|
||||
"36": ; preds = %"35", %"34"
|
||||
%"153" = load i64, ptr addrspace(5) %"90", align 4
|
||||
%"166" = inttoptr i64 %"153" to ptr
|
||||
%"80" = getelementptr inbounds i8, ptr %"166", i64 12
|
||||
%"154" = load i32, ptr addrspace(5) %"99", align 4
|
||||
store i32 %"154", ptr %"80", align 4
|
||||
"36": ; preds = %"35"
|
||||
store i32 0, ptr addrspace(5) %"95", align 4
|
||||
br label %"37"
|
||||
|
||||
"37": ; preds = %"36", %"35"
|
||||
%"149" = load i64, ptr addrspace(5) %"86", align 4
|
||||
%"162" = inttoptr i64 %"149" to ptr
|
||||
%"81" = getelementptr inbounds i8, ptr %"162", i64 12
|
||||
%"150" = load i32, ptr addrspace(5) %"95", align 4
|
||||
store i32 %"150", ptr %"81", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -1,174 +1,165 @@
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @setp_num(ptr addrspace(4) byref(i64) %"87", ptr addrspace(4) byref(i64) %"88") #0 {
|
||||
%"89" = alloca i64, align 8, addrspace(5)
|
||||
%"90" = alloca i64, align 8, addrspace(5)
|
||||
define amdgpu_kernel void @setp_num(ptr addrspace(4) byref(i64) %"83", ptr addrspace(4) byref(i64) %"84") #0 {
|
||||
%"85" = alloca i64, align 8, addrspace(5)
|
||||
%"86" = alloca i64, align 8, addrspace(5)
|
||||
%"87" = alloca float, align 4, addrspace(5)
|
||||
%"88" = alloca float, align 4, addrspace(5)
|
||||
%"89" = alloca float, align 4, addrspace(5)
|
||||
%"90" = alloca float, align 4, addrspace(5)
|
||||
%"91" = alloca float, align 4, addrspace(5)
|
||||
%"92" = alloca float, align 4, addrspace(5)
|
||||
%"93" = alloca float, align 4, addrspace(5)
|
||||
%"94" = alloca float, align 4, addrspace(5)
|
||||
%"95" = alloca float, align 4, addrspace(5)
|
||||
%"96" = alloca float, align 4, addrspace(5)
|
||||
%"97" = alloca float, align 4, addrspace(5)
|
||||
%"98" = alloca float, align 4, addrspace(5)
|
||||
%"99" = alloca i32, align 4, addrspace(5)
|
||||
%"100" = alloca i1, align 1, addrspace(5)
|
||||
%"95" = alloca i32, align 4, addrspace(5)
|
||||
%"96" = alloca i1, align 1, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"101" = load i64, ptr addrspace(4) %"87", align 4
|
||||
store i64 %"101", ptr addrspace(5) %"89", align 4
|
||||
%"102" = load i64, ptr addrspace(4) %"88", align 4
|
||||
store i64 %"102", ptr addrspace(5) %"90", align 4
|
||||
%"104" = load i64, ptr addrspace(5) %"89", align 4
|
||||
%"155" = inttoptr i64 %"104" to ptr
|
||||
%"103" = load float, ptr %"155", align 4
|
||||
store float %"103", ptr addrspace(5) %"91", align 4
|
||||
%"105" = load i64, ptr addrspace(5) %"89", align 4
|
||||
%"156" = inttoptr i64 %"105" to ptr
|
||||
%"54" = getelementptr inbounds i8, ptr %"156", i64 4
|
||||
%"106" = load float, ptr %"54", align 4
|
||||
store float %"106", ptr addrspace(5) %"92", align 4
|
||||
%"107" = load i64, ptr addrspace(5) %"89", align 4
|
||||
%"157" = inttoptr i64 %"107" to ptr
|
||||
%"56" = getelementptr inbounds i8, ptr %"157", i64 8
|
||||
%"108" = load float, ptr %"56", align 4
|
||||
store float %"108", ptr addrspace(5) %"93", align 4
|
||||
%"109" = load i64, ptr addrspace(5) %"89", align 4
|
||||
%"158" = inttoptr i64 %"109" to ptr
|
||||
%"58" = getelementptr inbounds i8, ptr %"158", i64 12
|
||||
%"110" = load float, ptr %"58", align 4
|
||||
store float %"110", ptr addrspace(5) %"94", align 4
|
||||
%"111" = load i64, ptr addrspace(5) %"89", align 4
|
||||
%"159" = inttoptr i64 %"111" to ptr
|
||||
%"60" = getelementptr inbounds i8, ptr %"159", i64 16
|
||||
%"112" = load float, ptr %"60", align 4
|
||||
store float %"112", ptr addrspace(5) %"95", align 4
|
||||
%"113" = load i64, ptr addrspace(5) %"89", align 4
|
||||
%"160" = inttoptr i64 %"113" to ptr
|
||||
%"62" = getelementptr inbounds i8, ptr %"160", i64 20
|
||||
%"114" = load float, ptr %"62", align 4
|
||||
store float %"114", ptr addrspace(5) %"96", align 4
|
||||
%"115" = load i64, ptr addrspace(5) %"89", align 4
|
||||
%"161" = inttoptr i64 %"115" to ptr
|
||||
%"64" = getelementptr inbounds i8, ptr %"161", i64 24
|
||||
%"116" = load float, ptr %"64", align 4
|
||||
store float %"116", ptr addrspace(5) %"97", align 4
|
||||
%"117" = load i64, ptr addrspace(5) %"89", align 4
|
||||
%"162" = inttoptr i64 %"117" to ptr
|
||||
%"66" = getelementptr inbounds i8, ptr %"162", i64 28
|
||||
%"118" = load float, ptr %"66", align 4
|
||||
store float %"118", ptr addrspace(5) %"98", align 4
|
||||
%"120" = load float, ptr addrspace(5) %"91", align 4
|
||||
%"121" = load float, ptr addrspace(5) %"92", align 4
|
||||
%"119" = fcmp ord float %"120", %"121"
|
||||
store i1 %"119", ptr addrspace(5) %"100", align 1
|
||||
%"122" = load i1, ptr addrspace(5) %"100", align 1
|
||||
br i1 %"122", label %"21", label %"22"
|
||||
br label %"82"
|
||||
|
||||
"21": ; preds = %1
|
||||
store i32 2, ptr addrspace(5) %"99", align 4
|
||||
br label %"22"
|
||||
"82": ; preds = %1
|
||||
%"97" = load i64, ptr addrspace(4) %"83", align 4
|
||||
store i64 %"97", ptr addrspace(5) %"85", align 4
|
||||
%"98" = load i64, ptr addrspace(4) %"84", align 4
|
||||
store i64 %"98", ptr addrspace(5) %"86", align 4
|
||||
%"100" = load i64, ptr addrspace(5) %"85", align 4
|
||||
%"151" = inttoptr i64 %"100" to ptr
|
||||
%"99" = load float, ptr %"151", align 4
|
||||
store float %"99", ptr addrspace(5) %"87", align 4
|
||||
%"101" = load i64, ptr addrspace(5) %"85", align 4
|
||||
%"152" = inttoptr i64 %"101" to ptr
|
||||
%"55" = getelementptr inbounds i8, ptr %"152", i64 4
|
||||
%"102" = load float, ptr %"55", align 4
|
||||
store float %"102", ptr addrspace(5) %"88", align 4
|
||||
%"103" = load i64, ptr addrspace(5) %"85", align 4
|
||||
%"153" = inttoptr i64 %"103" to ptr
|
||||
%"57" = getelementptr inbounds i8, ptr %"153", i64 8
|
||||
%"104" = load float, ptr %"57", align 4
|
||||
store float %"104", ptr addrspace(5) %"89", align 4
|
||||
%"105" = load i64, ptr addrspace(5) %"85", align 4
|
||||
%"154" = inttoptr i64 %"105" to ptr
|
||||
%"59" = getelementptr inbounds i8, ptr %"154", i64 12
|
||||
%"106" = load float, ptr %"59", align 4
|
||||
store float %"106", ptr addrspace(5) %"90", align 4
|
||||
%"107" = load i64, ptr addrspace(5) %"85", align 4
|
||||
%"155" = inttoptr i64 %"107" to ptr
|
||||
%"61" = getelementptr inbounds i8, ptr %"155", i64 16
|
||||
%"108" = load float, ptr %"61", align 4
|
||||
store float %"108", ptr addrspace(5) %"91", align 4
|
||||
%"109" = load i64, ptr addrspace(5) %"85", align 4
|
||||
%"156" = inttoptr i64 %"109" to ptr
|
||||
%"63" = getelementptr inbounds i8, ptr %"156", i64 20
|
||||
%"110" = load float, ptr %"63", align 4
|
||||
store float %"110", ptr addrspace(5) %"92", align 4
|
||||
%"111" = load i64, ptr addrspace(5) %"85", align 4
|
||||
%"157" = inttoptr i64 %"111" to ptr
|
||||
%"65" = getelementptr inbounds i8, ptr %"157", i64 24
|
||||
%"112" = load float, ptr %"65", align 4
|
||||
store float %"112", ptr addrspace(5) %"93", align 4
|
||||
%"113" = load i64, ptr addrspace(5) %"85", align 4
|
||||
%"158" = inttoptr i64 %"113" to ptr
|
||||
%"67" = getelementptr inbounds i8, ptr %"158", i64 28
|
||||
%"114" = load float, ptr %"67", align 4
|
||||
store float %"114", ptr addrspace(5) %"94", align 4
|
||||
%"116" = load float, ptr addrspace(5) %"87", align 4
|
||||
%"117" = load float, ptr addrspace(5) %"88", align 4
|
||||
%"115" = fcmp ord float %"116", %"117"
|
||||
store i1 %"115", ptr addrspace(5) %"96", align 1
|
||||
%"118" = load i1, ptr addrspace(5) %"96", align 1
|
||||
br i1 %"118", label %"22", label %"23"
|
||||
|
||||
"22": ; preds = %"21", %1
|
||||
%"124" = load i1, ptr addrspace(5) %"100", align 1
|
||||
br i1 %"124", label %"24", label %"23"
|
||||
"22": ; preds = %"82"
|
||||
store i32 2, ptr addrspace(5) %"95", align 4
|
||||
br label %"23"
|
||||
|
||||
"23": ; preds = %"22"
|
||||
store i32 0, ptr addrspace(5) %"99", align 4
|
||||
br label %"24"
|
||||
"23": ; preds = %"22", %"82"
|
||||
%"120" = load i1, ptr addrspace(5) %"96", align 1
|
||||
br i1 %"120", label %"25", label %"24"
|
||||
|
||||
"24": ; preds = %"23", %"22"
|
||||
%"126" = load i64, ptr addrspace(5) %"90", align 4
|
||||
%"127" = load i32, ptr addrspace(5) %"99", align 4
|
||||
%"163" = inttoptr i64 %"126" to ptr
|
||||
store i32 %"127", ptr %"163", align 4
|
||||
%"129" = load float, ptr addrspace(5) %"93", align 4
|
||||
%"130" = load float, ptr addrspace(5) %"94", align 4
|
||||
%"128" = fcmp ord float %"129", %"130"
|
||||
store i1 %"128", ptr addrspace(5) %"100", align 1
|
||||
%"131" = load i1, ptr addrspace(5) %"100", align 1
|
||||
br i1 %"131", label %"25", label %"26"
|
||||
"24": ; preds = %"23"
|
||||
store i32 0, ptr addrspace(5) %"95", align 4
|
||||
br label %"25"
|
||||
|
||||
"25": ; preds = %"24"
|
||||
store i32 2, ptr addrspace(5) %"99", align 4
|
||||
br label %"26"
|
||||
"25": ; preds = %"24", %"23"
|
||||
%"122" = load i64, ptr addrspace(5) %"86", align 4
|
||||
%"123" = load i32, ptr addrspace(5) %"95", align 4
|
||||
%"159" = inttoptr i64 %"122" to ptr
|
||||
store i32 %"123", ptr %"159", align 4
|
||||
%"125" = load float, ptr addrspace(5) %"89", align 4
|
||||
%"126" = load float, ptr addrspace(5) %"90", align 4
|
||||
%"124" = fcmp ord float %"125", %"126"
|
||||
store i1 %"124", ptr addrspace(5) %"96", align 1
|
||||
%"127" = load i1, ptr addrspace(5) %"96", align 1
|
||||
br i1 %"127", label %"26", label %"27"
|
||||
|
||||
"26": ; preds = %"25", %"24"
|
||||
%"133" = load i1, ptr addrspace(5) %"100", align 1
|
||||
br i1 %"133", label %"28", label %"27"
|
||||
"26": ; preds = %"25"
|
||||
store i32 2, ptr addrspace(5) %"95", align 4
|
||||
br label %"27"
|
||||
|
||||
"27": ; preds = %"26"
|
||||
store i32 0, ptr addrspace(5) %"99", align 4
|
||||
br label %"28"
|
||||
"27": ; preds = %"26", %"25"
|
||||
%"129" = load i1, ptr addrspace(5) %"96", align 1
|
||||
br i1 %"129", label %"29", label %"28"
|
||||
|
||||
"28": ; preds = %"27", %"26"
|
||||
%"135" = load i64, ptr addrspace(5) %"90", align 4
|
||||
%"164" = inttoptr i64 %"135" to ptr
|
||||
%"72" = getelementptr inbounds i8, ptr %"164", i64 4
|
||||
%"136" = load i32, ptr addrspace(5) %"99", align 4
|
||||
store i32 %"136", ptr %"72", align 4
|
||||
%"138" = load float, ptr addrspace(5) %"95", align 4
|
||||
%"139" = load float, ptr addrspace(5) %"96", align 4
|
||||
%"137" = fcmp ord float %"138", %"139"
|
||||
store i1 %"137", ptr addrspace(5) %"100", align 1
|
||||
%"140" = load i1, ptr addrspace(5) %"100", align 1
|
||||
br i1 %"140", label %"29", label %"30"
|
||||
"28": ; preds = %"27"
|
||||
store i32 0, ptr addrspace(5) %"95", align 4
|
||||
br label %"29"
|
||||
|
||||
"29": ; preds = %"28"
|
||||
store i32 2, ptr addrspace(5) %"99", align 4
|
||||
br label %"30"
|
||||
"29": ; preds = %"28", %"27"
|
||||
%"131" = load i64, ptr addrspace(5) %"86", align 4
|
||||
%"160" = inttoptr i64 %"131" to ptr
|
||||
%"73" = getelementptr inbounds i8, ptr %"160", i64 4
|
||||
%"132" = load i32, ptr addrspace(5) %"95", align 4
|
||||
store i32 %"132", ptr %"73", align 4
|
||||
%"134" = load float, ptr addrspace(5) %"91", align 4
|
||||
%"135" = load float, ptr addrspace(5) %"92", align 4
|
||||
%"133" = fcmp ord float %"134", %"135"
|
||||
store i1 %"133", ptr addrspace(5) %"96", align 1
|
||||
%"136" = load i1, ptr addrspace(5) %"96", align 1
|
||||
br i1 %"136", label %"30", label %"31"
|
||||
|
||||
"30": ; preds = %"29", %"28"
|
||||
%"142" = load i1, ptr addrspace(5) %"100", align 1
|
||||
br i1 %"142", label %"32", label %"31"
|
||||
"30": ; preds = %"29"
|
||||
store i32 2, ptr addrspace(5) %"95", align 4
|
||||
br label %"31"
|
||||
|
||||
"31": ; preds = %"30"
|
||||
store i32 0, ptr addrspace(5) %"99", align 4
|
||||
br label %"32"
|
||||
"31": ; preds = %"30", %"29"
|
||||
%"138" = load i1, ptr addrspace(5) %"96", align 1
|
||||
br i1 %"138", label %"33", label %"32"
|
||||
|
||||
"32": ; preds = %"31", %"30"
|
||||
%"144" = load i64, ptr addrspace(5) %"90", align 4
|
||||
%"165" = inttoptr i64 %"144" to ptr
|
||||
%"76" = getelementptr inbounds i8, ptr %"165", i64 8
|
||||
%"145" = load i32, ptr addrspace(5) %"99", align 4
|
||||
store i32 %"145", ptr %"76", align 4
|
||||
%"147" = load float, ptr addrspace(5) %"97", align 4
|
||||
%"148" = load float, ptr addrspace(5) %"98", align 4
|
||||
%"146" = fcmp ord float %"147", %"148"
|
||||
store i1 %"146", ptr addrspace(5) %"100", align 1
|
||||
%"149" = load i1, ptr addrspace(5) %"100", align 1
|
||||
br i1 %"149", label %"33", label %"34"
|
||||
"32": ; preds = %"31"
|
||||
store i32 0, ptr addrspace(5) %"95", align 4
|
||||
br label %"33"
|
||||
|
||||
"33": ; preds = %"32"
|
||||
store i32 2, ptr addrspace(5) %"99", align 4
|
||||
br label %"34"
|
||||
"33": ; preds = %"32", %"31"
|
||||
%"140" = load i64, ptr addrspace(5) %"86", align 4
|
||||
%"161" = inttoptr i64 %"140" to ptr
|
||||
%"77" = getelementptr inbounds i8, ptr %"161", i64 8
|
||||
%"141" = load i32, ptr addrspace(5) %"95", align 4
|
||||
store i32 %"141", ptr %"77", align 4
|
||||
%"143" = load float, ptr addrspace(5) %"93", align 4
|
||||
%"144" = load float, ptr addrspace(5) %"94", align 4
|
||||
%"142" = fcmp ord float %"143", %"144"
|
||||
store i1 %"142", ptr addrspace(5) %"96", align 1
|
||||
%"145" = load i1, ptr addrspace(5) %"96", align 1
|
||||
br i1 %"145", label %"34", label %"35"
|
||||
|
||||
"34": ; preds = %"33", %"32"
|
||||
%"151" = load i1, ptr addrspace(5) %"100", align 1
|
||||
br i1 %"151", label %"36", label %"35"
|
||||
"34": ; preds = %"33"
|
||||
store i32 2, ptr addrspace(5) %"95", align 4
|
||||
br label %"35"
|
||||
|
||||
"35": ; preds = %"34"
|
||||
store i32 0, ptr addrspace(5) %"99", align 4
|
||||
br label %"36"
|
||||
"35": ; preds = %"34", %"33"
|
||||
%"147" = load i1, ptr addrspace(5) %"96", align 1
|
||||
br i1 %"147", label %"37", label %"36"
|
||||
|
||||
"36": ; preds = %"35", %"34"
|
||||
%"153" = load i64, ptr addrspace(5) %"90", align 4
|
||||
%"166" = inttoptr i64 %"153" to ptr
|
||||
%"80" = getelementptr inbounds i8, ptr %"166", i64 12
|
||||
%"154" = load i32, ptr addrspace(5) %"99", align 4
|
||||
store i32 %"154", ptr %"80", align 4
|
||||
"36": ; preds = %"35"
|
||||
store i32 0, ptr addrspace(5) %"95", align 4
|
||||
br label %"37"
|
||||
|
||||
"37": ; preds = %"36", %"35"
|
||||
%"149" = load i64, ptr addrspace(5) %"86", align 4
|
||||
%"162" = inttoptr i64 %"149" to ptr
|
||||
%"81" = getelementptr inbounds i8, ptr %"162", i64 12
|
||||
%"150" = load i32, ptr addrspace(5) %"95", align 4
|
||||
store i32 %"150", ptr %"81", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -1,49 +1,40 @@
|
||||
@shared_mem1 = external addrspace(3) global [128 x i8], align 4
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @shared_ptr_32(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #0 {
|
||||
define amdgpu_kernel void @shared_ptr_32(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 {
|
||||
%"37" = alloca i64, align 8, addrspace(5)
|
||||
%"38" = alloca i64, align 8, addrspace(5)
|
||||
%"39" = alloca i32, align 4, addrspace(5)
|
||||
%"40" = alloca i64, align 8, addrspace(5)
|
||||
%"41" = alloca i64, align 8, addrspace(5)
|
||||
%"42" = alloca i64, align 8, addrspace(5)
|
||||
%"43" = alloca i32, align 4, addrspace(5)
|
||||
%"44" = alloca i64, align 8, addrspace(5)
|
||||
%"45" = alloca i64, align 8, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"46" = load i64, ptr addrspace(4) %"39", align 4
|
||||
store i64 %"46", ptr addrspace(5) %"41", align 4
|
||||
%"47" = load i64, ptr addrspace(4) %"40", align 4
|
||||
store i64 %"47", ptr addrspace(5) %"42", align 4
|
||||
store i32 ptrtoint (ptr addrspace(3) @shared_mem1 to i32), ptr addrspace(5) %"43", align 4
|
||||
%"50" = load i64, ptr addrspace(5) %"41", align 4
|
||||
%"58" = inttoptr i64 %"50" to ptr addrspace(1)
|
||||
%"49" = load i64, ptr addrspace(1) %"58", align 4
|
||||
store i64 %"49", ptr addrspace(5) %"44", align 4
|
||||
%"51" = load i32, ptr addrspace(5) %"43", align 4
|
||||
%"52" = load i64, ptr addrspace(5) %"44", align 4
|
||||
%"59" = inttoptr i32 %"51" to ptr addrspace(3)
|
||||
store i64 %"52", ptr addrspace(3) %"59", align 4
|
||||
%"53" = load i32, ptr addrspace(5) %"43", align 4
|
||||
%"60" = inttoptr i32 %"53" to ptr addrspace(3)
|
||||
%"32" = getelementptr inbounds i8, ptr addrspace(3) %"60", i64 0
|
||||
%"54" = load i64, ptr addrspace(3) %"32", align 4
|
||||
store i64 %"54", ptr addrspace(5) %"45", align 4
|
||||
%"55" = load i64, ptr addrspace(5) %"42", align 4
|
||||
%"56" = load i64, ptr addrspace(5) %"45", align 4
|
||||
%"61" = inttoptr i64 %"55" to ptr addrspace(1)
|
||||
store i64 %"56", ptr addrspace(1) %"61", align 4
|
||||
br label %"34"
|
||||
|
||||
"34": ; preds = %1
|
||||
%"42" = load i64, ptr addrspace(4) %"35", align 4
|
||||
store i64 %"42", ptr addrspace(5) %"37", align 4
|
||||
%"43" = load i64, ptr addrspace(4) %"36", align 4
|
||||
store i64 %"43", ptr addrspace(5) %"38", align 4
|
||||
store i32 ptrtoint (ptr addrspace(3) @shared_mem1 to i32), ptr addrspace(5) %"39", align 4
|
||||
%"46" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"54" = inttoptr i64 %"46" to ptr addrspace(1)
|
||||
%"45" = load i64, ptr addrspace(1) %"54", align 4
|
||||
store i64 %"45", ptr addrspace(5) %"40", align 4
|
||||
%"47" = load i32, ptr addrspace(5) %"39", align 4
|
||||
%"48" = load i64, ptr addrspace(5) %"40", align 4
|
||||
%"55" = inttoptr i32 %"47" to ptr addrspace(3)
|
||||
store i64 %"48", ptr addrspace(3) %"55", align 4
|
||||
%"49" = load i32, ptr addrspace(5) %"39", align 4
|
||||
%"56" = inttoptr i32 %"49" to ptr addrspace(3)
|
||||
%"33" = getelementptr inbounds i8, ptr addrspace(3) %"56", i64 0
|
||||
%"50" = load i64, ptr addrspace(3) %"33", align 4
|
||||
store i64 %"50", ptr addrspace(5) %"41", align 4
|
||||
%"51" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"52" = load i64, ptr addrspace(5) %"41", align 4
|
||||
%"57" = inttoptr i64 %"51" to ptr addrspace(1)
|
||||
store i64 %"52", ptr addrspace(1) %"57", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -1,48 +1,39 @@
|
||||
@shared_mem = external addrspace(3) global [0 x i8], align 4
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @shared_ptr_take_address(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 {
|
||||
define amdgpu_kernel void @shared_ptr_take_address(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 {
|
||||
%"35" = alloca i64, align 8, addrspace(5)
|
||||
%"36" = alloca i64, align 8, addrspace(5)
|
||||
%"37" = alloca i64, align 8, addrspace(5)
|
||||
%"38" = alloca i64, align 8, addrspace(5)
|
||||
%"39" = alloca i64, align 8, addrspace(5)
|
||||
%"40" = alloca i64, align 8, addrspace(5)
|
||||
%"41" = alloca i64, align 8, addrspace(5)
|
||||
%"42" = alloca i64, align 8, addrspace(5)
|
||||
%"43" = alloca i64, align 8, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
%"44" = load i64, ptr addrspace(4) %"37", align 4
|
||||
store i64 %"44", ptr addrspace(5) %"39", align 4
|
||||
%"45" = load i64, ptr addrspace(4) %"38", align 4
|
||||
store i64 %"45", ptr addrspace(5) %"40", align 4
|
||||
store i64 ptrtoint (ptr addrspace(3) @shared_mem to i64), ptr addrspace(5) %"41", align 4
|
||||
%"48" = load i64, ptr addrspace(5) %"39", align 4
|
||||
%"56" = inttoptr i64 %"48" to ptr addrspace(1)
|
||||
%"47" = load i64, ptr addrspace(1) %"56", align 4
|
||||
store i64 %"47", ptr addrspace(5) %"42", align 4
|
||||
%"49" = load i64, ptr addrspace(5) %"41", align 4
|
||||
%"50" = load i64, ptr addrspace(5) %"42", align 4
|
||||
%"57" = inttoptr i64 %"49" to ptr addrspace(3)
|
||||
store i64 %"50", ptr addrspace(3) %"57", align 4
|
||||
%"52" = load i64, ptr addrspace(5) %"41", align 4
|
||||
%"58" = inttoptr i64 %"52" to ptr addrspace(3)
|
||||
%"51" = load i64, ptr addrspace(3) %"58", align 4
|
||||
store i64 %"51", ptr addrspace(5) %"43", align 4
|
||||
%"53" = load i64, ptr addrspace(5) %"40", align 4
|
||||
%"54" = load i64, ptr addrspace(5) %"43", align 4
|
||||
%"59" = inttoptr i64 %"53" to ptr addrspace(1)
|
||||
store i64 %"54", ptr addrspace(1) %"59", align 4
|
||||
br label %"32"
|
||||
|
||||
"32": ; preds = %1
|
||||
%"40" = load i64, ptr addrspace(4) %"33", align 4
|
||||
store i64 %"40", ptr addrspace(5) %"35", align 4
|
||||
%"41" = load i64, ptr addrspace(4) %"34", align 4
|
||||
store i64 %"41", ptr addrspace(5) %"36", align 4
|
||||
store i64 ptrtoint (ptr addrspace(3) @shared_mem to i64), ptr addrspace(5) %"37", align 4
|
||||
%"44" = load i64, ptr addrspace(5) %"35", align 4
|
||||
%"52" = inttoptr i64 %"44" to ptr addrspace(1)
|
||||
%"43" = load i64, ptr addrspace(1) %"52", align 4
|
||||
store i64 %"43", ptr addrspace(5) %"38", align 4
|
||||
%"45" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"46" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"53" = inttoptr i64 %"45" to ptr addrspace(3)
|
||||
store i64 %"46", ptr addrspace(3) %"53", align 4
|
||||
%"48" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"54" = inttoptr i64 %"48" to ptr addrspace(3)
|
||||
%"47" = load i64, ptr addrspace(3) %"54", align 4
|
||||
store i64 %"47", ptr addrspace(5) %"39", align 4
|
||||
%"49" = load i64, ptr addrspace(5) %"36", align 4
|
||||
%"50" = load i64, ptr addrspace(5) %"39", align 4
|
||||
%"55" = inttoptr i64 %"49" to ptr addrspace(1)
|
||||
store i64 %"50", ptr addrspace(1) %"55", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
@ -1,25 +1,16 @@
|
||||
@shared_ex = external addrspace(3) global [0 x i32]
|
||||
@shared_mod = external addrspace(3) global [4 x i32]
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define i64 @__zluda_ptx_impl_add() #0 {
|
||||
define i64 @add() #0 {
|
||||
%"46" = alloca i64, align 8, addrspace(5)
|
||||
%"47" = alloca i64, align 8, addrspace(5)
|
||||
%"48" = alloca i64, align 8, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
br label %"41"
|
||||
|
||||
"41": ; preds = %1
|
||||
%"49" = load i64, ptr addrspace(3) @shared_mod, align 4
|
||||
store i64 %"49", ptr addrspace(5) %"47", align 4
|
||||
%"50" = load i64, ptr addrspace(3) @shared_ex, align 4
|
||||
@ -32,19 +23,25 @@ define i64 @__zluda_ptx_impl_add() #0 {
|
||||
ret i64 %2
|
||||
}
|
||||
|
||||
define i64 @__zluda_ptx_impl_set_shared_temp1(i64 %"15") #0 {
|
||||
define i64 @set_shared_temp1(i64 %"15") #0 {
|
||||
%"54" = alloca i64, align 8, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
br label %"42"
|
||||
|
||||
"42": ; preds = %1
|
||||
store i64 %"15", ptr addrspace(3) @shared_ex, align 4
|
||||
%"55" = call i64 @__zluda_ptx_impl_add()
|
||||
%"55" = call i64 @add()
|
||||
store i64 %"55", ptr addrspace(5) %"54", align 4
|
||||
br label %"43"
|
||||
|
||||
"43": ; preds = %"42"
|
||||
%2 = load i64, ptr addrspace(5) %"54", align 4
|
||||
ret i64 %2
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @shared_unify_extern(ptr addrspace(4) byref(i64) %"56", ptr addrspace(4) byref(i64) %"57") #0 {
|
||||
define amdgpu_kernel void @shared_unify_extern(ptr addrspace(4) byref(i64) %"56", ptr addrspace(4) byref(i64) %"57") #1 {
|
||||
%"58" = alloca i64, align 8, addrspace(5)
|
||||
%"59" = alloca i64, align 8, addrspace(5)
|
||||
%"60" = alloca i64, align 8, addrspace(5)
|
||||
@ -52,6 +49,9 @@ define amdgpu_kernel void @shared_unify_extern(ptr addrspace(4) byref(i64) %"56"
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
br label %"44"
|
||||
|
||||
"44": ; preds = %1
|
||||
%"62" = load i64, ptr addrspace(4) %"56", align 4
|
||||
store i64 %"62", ptr addrspace(5) %"58", align 4
|
||||
%"63" = load i64, ptr addrspace(4) %"57", align 4
|
||||
@ -62,14 +62,17 @@ define amdgpu_kernel void @shared_unify_extern(ptr addrspace(4) byref(i64) %"56"
|
||||
store i64 %"64", ptr addrspace(5) %"60", align 4
|
||||
%"66" = load i64, ptr addrspace(5) %"58", align 4
|
||||
%"79" = inttoptr i64 %"66" to ptr addrspace(1)
|
||||
%"39" = getelementptr inbounds i8, ptr addrspace(1) %"79", i64 8
|
||||
%"67" = load i64, ptr addrspace(1) %"39", align 4
|
||||
%"40" = getelementptr inbounds i8, ptr addrspace(1) %"79", i64 8
|
||||
%"67" = load i64, ptr addrspace(1) %"40", align 4
|
||||
store i64 %"67", ptr addrspace(5) %"61", align 4
|
||||
%"68" = load i64, ptr addrspace(5) %"61", align 4
|
||||
store i64 %"68", ptr addrspace(3) @shared_mod, align 4
|
||||
%"70" = load i64, ptr addrspace(5) %"60", align 4
|
||||
%"81" = call i64 @__zluda_ptx_impl_set_shared_temp1(i64 %"70")
|
||||
%"81" = call i64 @set_shared_temp1(i64 %"70")
|
||||
store i64 %"81", ptr addrspace(5) %"61", align 4
|
||||
br label %"45"
|
||||
|
||||
"45": ; preds = %"44"
|
||||
%"71" = load i64, ptr addrspace(5) %"59", align 4
|
||||
%"72" = load i64, ptr addrspace(5) %"61", align 4
|
||||
%"83" = inttoptr i64 %"71" to ptr
|
||||
@ -77,4 +80,5 @@ define amdgpu_kernel void @shared_unify_extern(ptr addrspace(4) byref(i64) %"56"
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user