Compare commits


10 Commits
v4 ... master

Author SHA1 Message Date
Andrzej Janik
adc4673a20 Explicitly fail compilation on ROCm 6.4 (#361)
AMD broke comgr ABI in 6.4. This is a temporary solution.
2025-04-20 17:02:05 +02:00
Joëlle van Essen
7cdab7abc2 Implement mul24 (#351) 2025-04-08 12:27:19 +02:00
Andrzej Janik
d704e92c97 Support instruction modes (denormal and rounding) on AMD GPUs (#342) 2025-03-17 21:37:26 +01:00
Joëlle van Essen
867e4728d5 LLVM unit tests (#324)
* LLVM unit tests: add assembly files

* LLVM unit tests: first attempt

* LLVM unit tests: fix - parse bitcode in context

* LLVM unit tests: use pretty_assertions for line-by-line diff

* LLVM unit tests: Write IR to file for failed test

* LLVM unit tests: just use the stack

* LLVM unit tests: use MaybeUninit

* LLVM unit tests: add mul24.ll

* LLVM unit tests: Adjustments after review

* LLVM unit tests: Include emit_llvm::Context in emit_llvm::Module

* LLVM unit tests: Fix typo

* LLVM unit tests: Context need not be pub
2025-02-19 21:21:20 +01:00
Andrzej Janik
646d746e02 Start working on mul24 2025-02-07 19:37:11 +00:00
Andrzej Janik
df5a96d935 Improve build system (#329)
Also fix Dockerfile and Windows build
2025-01-28 01:55:36 +01:00
Alexander Zaitsev
9c0747a5f7 fix: missing inherits in a release-lto profile (#319) 2025-01-03 16:58:19 +01:00
Alexander Zaitsev
fee20e54d9 feat: enable LTO and codegen-units = 1 optimization (#318) 2025-01-02 19:07:39 +01:00
Joëlle van Essen
7399132d5d Fix test in zluda_dump (#316) 2025-01-01 23:02:59 +01:00
Andrzej Janik
ecd61a8e2a Update README for version 4 (#315) 2024-12-31 17:33:59 +01:00
149 changed files with 9859 additions and 796 deletions

View File

@ -0,0 +1,2 @@
[alias]
xtask = "run --package xtask --"

View File

@ -32,7 +32,7 @@ RUN DEBIAN_FRONTEND=noninteractive apt-get update -y && DEBIAN_FRONTEND=noninter
     cuda-profiler-api-${CUDA_PKG_VERSION} \
     cuda-nvcc-${CUDA_PKG_VERSION}
-ARG ROCM_VERSION=6.2.2
+ARG ROCM_VERSION=6.3.1
 RUN mkdir --parents --mode=0755 /etc/apt/keyrings && \
     wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | \
     gpg --dearmor | tee /etc/apt/keyrings/rocm.gpg > /dev/null && \
@ -43,7 +43,7 @@ RUN mkdir --parents --mode=0755 /etc/apt/keyrings && \
     rocm-gdb \
     rocm-smi-lib \
     rocm-llvm-dev \
-    hip-runtime-amd && \
+    hip-runtime-amd \
+    hip-dev && \
     echo '/opt/rocm/lib' > /etc/ld.so.conf.d/rocm.conf && \
     ldconfig

View File

@ -1,61 +0,0 @@
# Dependencies
Development builds of ZLUDA require the following dependencies:
* CMake
* Python 3
Additionally, the repository has to be cloned with Git submodules initialized. If you cloned the repo without initializing submodules, do this:
```
git submodule update --init --recursive
```
# Tests
Tests should be executed with the `--workspace` option to test non-default targets:
```
cargo test --workspace
```
# Debugging
## Debugging CUDA applications
When running an application with ZLUDA you will quite often run into subtle bugs or incompatibilities in the generated GPU code. The best way to debug an application's GPU CUDA code is to use the ZLUDA dumper.
The `zluda_dump` library can be injected into a CUDA application and produce a trace which, for every launched GPU function, contains:
* PTX source
* Launch arguments (block size, grid size, shared memory size)
* A dump of function arguments, both before and after execution
Example use with GeekBench:
```
set ZLUDA_DUMP_KERNEL=knn_match
set ZLUDA_DUMP_DIR=C:\temp\zluda_dump
"<ZLUDA_PATH>\zluda_with.exe" "<ZLUDA_PATH>\zluda_dump.dll" -- "geekbench_x86_64.exe" --compute CUDA
```
The example above will, for every execution of the GPU function `knn_match`, save its details into the directory `C:\temp\zluda_dump`.
This dump can be replayed with the `replay.py` script from the `zluda_dump` source directory. Use it like this:
```
python replay.py "C:\temp\zluda_dump\geekbench_x86_64.exe"
```
You must copy (or symlink) the ZLUDA `nvcuda.dll` into the PyCUDA directory so that it runs using ZLUDA. Example output:
```
Intel(R) Graphics [0x3e92] [github.com/vosen/ZLUDA]
C:\temp\zluda_dump\geekbench_x86_64.exe\4140_scale_pyramid
C:\temp\zluda_dump\geekbench_x86_64.exe\4345_convolve_1d_vertical_grayscale
Skipping, launch block size (512) bigger than maximum block size (256)
C:\temp\zluda_dump\geekbench_x86_64.exe\4480_scale_pyramid
6:
Arrays are not equal
Mismatched elements: 1200 / 19989588 (0.006%)
Max absolute difference: 255
Max relative difference: 255.
x: array([ 7, 6, 8, ..., 193, 195, 193], dtype=uint8)
y: array([ 7, 6, 8, ..., 193, 195, 193], dtype=uint8)
```
From this output one can observe that in kernel launch 4480, the 6th argument to the function `scale_pyramid` differs between what was executed on an NVIDIA GPU using CUDA and on an Intel GPU using ZLUDA.
__Important__: It's impossible to infer the type (and semantics) of an argument passed to a GPU function. At our level it's just a buffer of bytes, and by default `replay.py` simply checks whether two buffers are byte-equal. That means you will get a ton of spurious mismatches when running `replay.py`. You should override the comparisons for your particular case in `replay.py` - it already contains some overrides for GeekBench kernels.

Cargo.lock (generated file, 1395 lines added)

File diff suppressed because it is too large

View File

@ -18,7 +18,16 @@ members = [
"ptx_parser", "ptx_parser",
"ptx_parser_macros", "ptx_parser_macros",
"ptx_parser_macros_impl", "ptx_parser_macros_impl",
"xtask",
"zluda_bindgen", "zluda_bindgen",
] ]
default-members = ["zluda", "zluda_ml", "zluda_inject", "zluda_redirect"] default-members = ["zluda", "zluda_ml", "zluda_inject", "zluda_redirect"]
[profile.release-lto]
inherits = "release"
codegen-units = 1
lto = true
[profile.dev.package.xtask]
opt-level = 2
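Presumably the new `release-lto` profile is opt-in and selected via Cargo's standard flag, e.g. `cargo build --profile release-lto`; whether the `xtask` wrapper forwards it is not shown in this diff.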

File diff suppressed because one or more lines are too long

Before: image, 259 KiB

View File

@ -4,18 +4,23 @@
 ZLUDA is a drop-in replacement for CUDA on non-NVIDIA GPU. ZLUDA allows to run unmodified CUDA applications using non-NVIDIA GPUs with near-native performance.
+ZLUDA supports AMD Radeon RX 5000 series and newer GPUs (both desktop and integrated).
+![GeekBench 5.5.1 chart](geekbench.svg)
 ZLUDA is work in progress. Follow development here and say hi on [Discord](https://discord.gg/sg6BNzXuc7). For more details see the announcement: https://vosen.github.io/ZLUDA/blog/zludas-third-life/
 ## Usage
-**Warning**: ZLUDA is under heavy development (see news [here](https://vosen.github.io/ZLUDA/blog/zludas-third-life/)). Instructions below might not work.
+**Warning**: This version of ZLUDA is under heavy development (more [here](https://vosen.github.io/ZLUDA/blog/zludas-third-life/)) and right now only supports Geekbench. ZLUDA probably will not work with your application just yet.
 ### Windows
-You should have the most recent ROCm installed.\
-Run your application like this:
-```
-<ZLUDA_DIRECTORY>\zluda_with.exe -- <APPLICATION> <APPLICATIONS_ARGUMENTS>
-```
+You should have a recent AMD GPU driver ("AMD Software: Adrenalin Edition") installed.\
+To run your application you should either:
+* (Recommended approach) Copy the ZLUDA-provided `nvcuda.dll` and `nvml.dll` from `target\release` (if built from sources) or `zluda` (if you downloaded a zip package) into a path which your application uses to load CUDA. Paths vary from application to application, but usually it's the directory where the .exe file is located
+* Use the ZLUDA launcher like below. The ZLUDA launcher is known to be buggy and incomplete:
+```
+<ZLUDA_DIRECTORY>\zluda_with.exe -- <APPLICATION> <APPLICATIONS_ARGUMENTS>
+```
 ### Linux
@ -24,33 +29,44 @@ Run your application like this:
 LD_LIBRARY_PATH=<ZLUDA_DIRECTORY> <APPLICATION> <APPLICATIONS_ARGUMENTS>
 ```
+where `<ZLUDA_DIRECTORY>` is the directory which contains the ZLUDA-provided `libcuda.so`: `target/release` if you built from sources or `zluda` if you downloaded the prebuilt package.
 ### MacOS
 Not supported
 ## Building
-_Note_: This repo has submodules. Make sure to recurse submodules when cloning this repo, e.g.: `git clone --recursive https://github.com/vosen/ZLUDA.git`
-You should have a relatively recent version of Rust installed, then you just do:
-```
-cargo build --release
-```
-in the main directory of the project.
-### Linux
-If you are building on Linux you must also symlink (or rename) the ZLUDA output binaries after ZLUDA build finishes:
-```
-ln -s libnvcuda.so target/release/libcuda.so
-ln -s libnvcuda.so target/release/libcuda.so.1
-ln -s libnvml.so target/release/libnvidia-ml.so
-```
+**Warning**: ZLUDA is under heavy development (see news [here](https://vosen.github.io/ZLUDA/blog/zludas-third-life/)). Instructions below might not work.
+### Dependencies
+* Git
+* CMake
+* Python 3
+* Rust compiler (recent version)
+* C++ compiler
+* (Optional, but recommended) [Ninja build system](https://ninja-build.org/)
+### Build steps
+* Git clone the repo (make sure to use the `--recursive` option to fetch submodules):
+  `git clone --recursive https://github.com/vosen/ZLUDA.git`
+* Enter the freshly cloned `ZLUDA` directory and build with cargo (this takes a while):
+  `cargo xtask --release`
 ## Contributing
-If you want to develop ZLUDA itself, read [CONTRIBUTING.md](CONTRIBUTING.md), it contains instructions how to set up dependencies and run tests
+The ZLUDA project has commercial backing and _does not_ accept donations.
+The ZLUDA project accepts pull requests and other non-monetary contributions.
+If you want to contribute a code fix or documentation update, feel free to open a Pull Request.
+### Getting started
+There's no architecture document (yet). The two most important crates in ZLUDA are `ptx` (the PTX compiler) and `zluda` (the AMD GPU runtime). A good starting point for tinkering with the project is to run one of the `ptx` unit tests under a debugger and understand what it is doing. `cargo test -p ptx -- ::add_hip` is a simple test that adds two numbers.
+GitHub issues tagged with ["help wanted"](https://github.com/vosen/ZLUDA/issues?q=is%3Aissue+is%3Aopen+label%3A%22help+wanted%22) are tasks that are self-contained. Their level of difficulty varies; they are not always good beginner tasks, but they are defined unambiguously.
+If you have questions, feel free to ask on the [#devtalk channel on Discord](https://discord.com/channels/1273316903783497778/1303329281409159270).
 ## License

View File

@ -133,21 +133,26 @@ pub fn compile_bitcode(
         &linking_info,
         amd_comgr_action_kind_t::AMD_COMGR_ACTION_LINK_BC_TO_BC,
     )?;
-    let link_with_device_libs_info = ActionInfo::new()?;
-    link_with_device_libs_info.set_isa_name(gcn_arch)?;
-    link_with_device_libs_info.set_language(amd_comgr_language_t::AMD_COMGR_LANGUAGE_LLVM_IR)?;
-    // This makes no sense, but it makes ockl linking work
-    link_with_device_libs_info
-        .set_options([c"-Xclang", c"-mno-link-builtin-bitcode-postopt"].into_iter())?;
-    let with_device_libs = do_action(
-        &linked_data_set,
-        &link_with_device_libs_info,
-        amd_comgr_action_kind_t::AMD_COMGR_ACTION_COMPILE_SOURCE_WITH_DEVICE_LIBS_TO_BC,
-    )?;
-    let compile_action_info = ActionInfo::new()?;
-    compile_action_info.set_isa_name(gcn_arch)?;
-    let common_options = [c"-O3", c"-mno-wavefrontsize64", c"-mcumode"].into_iter();
+    let compile_to_exec = ActionInfo::new()?;
+    compile_to_exec.set_isa_name(gcn_arch)?;
+    compile_to_exec.set_language(amd_comgr_language_t::AMD_COMGR_LANGUAGE_LLVM_IR)?;
+    let common_options = [
+        // This makes no sense, but it makes ockl linking work
+        c"-Xclang",
+        c"-mno-link-builtin-bitcode-postopt",
+        // Otherwise LLVM omits dynamic fp mode for ockl functions during linking
+        // and then fails to inline them
+        c"-Xclang",
+        c"-fdenormal-fp-math=dynamic",
+        c"-O3",
+        c"-mno-wavefrontsize64",
+        c"-mcumode",
+        // Useful for inlining reports, combined with AMD_COMGR_SAVE_TEMPS=1 AMD_COMGR_EMIT_VERBOSE_LOGS=1 AMD_COMGR_REDIRECT_LOGS=stderr
+        // c"-fsave-optimization-record=yaml",
+    ]
+    .into_iter();
     let opt_options = if cfg!(debug_assertions) {
+        //[c"-g", c"-mllvm", c"-print-before-all", c"", c""]
         [c"-g", c"", c"", c"", c""]
     } else {
         [
@ -159,19 +164,14 @@ pub fn compile_bitcode(
             c"-inlinehint-threshold=3250",
         ]
     };
-    compile_action_info.set_options(common_options.chain(opt_options))?;
-    let reloc_data_set = do_action(
-        &with_device_libs,
-        &compile_action_info,
-        amd_comgr_action_kind_t::AMD_COMGR_ACTION_CODEGEN_BC_TO_RELOCATABLE,
-    )?;
+    compile_to_exec.set_options(common_options.chain(opt_options))?;
     let exec_data_set = do_action(
-        &reloc_data_set,
-        &compile_action_info,
-        amd_comgr_action_kind_t::AMD_COMGR_ACTION_LINK_RELOCATABLE_TO_EXECUTABLE,
+        &linked_data_set,
+        &compile_to_exec,
+        amd_comgr_action_kind_t::AMD_COMGR_ACTION_COMPILE_SOURCE_TO_EXECUTABLE,
     )?;
     let executable =
         exec_data_set.get_data(amd_comgr_data_kind_t::AMD_COMGR_DATA_KIND_EXECUTABLE, 0)?;
     executable.copy_content()
 }

View File

@ -13,7 +13,7 @@ fn main() -> Result<(), VarError> {
println!("cargo:rustc-link-search=native=C:\\Windows\\System32"); println!("cargo:rustc-link-search=native=C:\\Windows\\System32");
}; };
} else { } else {
println!("cargo:rustc-link-lib=dylib=amd_comgr"); println!("cargo:rustc-link-lib=dylib:+verbatim=libamd_comgr.so.2");
println!("cargo:rustc-link-search=native=/opt/rocm/lib/"); println!("cargo:rustc-link-search=native=/opt/rocm/lib/");
} }
Ok(()) Ok(())

View File

@ -13,7 +13,7 @@ fn main() -> Result<(), VarError> {
println!("cargo:rustc-link-search=native=C:\\Windows\\System32"); println!("cargo:rustc-link-search=native=C:\\Windows\\System32");
}; };
} else { } else {
println!("cargo:rustc-link-lib=dylib=amdhip64"); println!("cargo:rustc-link-lib=dylib:+verbatim=libamdhip64.so.6");
println!("cargo:rustc-link-search=native=/opt/rocm/lib/"); println!("cargo:rustc-link-search=native=/opt/rocm/lib/");
} }
Ok(()) Ok(())

geekbench.svg (new file, 1 line added)

File diff suppressed because one or more lines are too long

After: image, 287 KiB

View File

@ -17,6 +17,10 @@ bitflags = "1.2"
 rustc-hash = "2.0.0"
 strum = "0.26"
 strum_macros = "0.26"
+petgraph = "0.7.1"
+microlp = "0.2.10"
+int-enum = "1.1"
+unwrap_or = "1.0.1"

 [dev-dependencies]
 hip_runtime-sys = { path = "../ext/hip_runtime-sys" }
@ -24,3 +28,4 @@ comgr = { path = "../comgr" }
 tempfile = "3"
 paste = "1.0"
 cuda-driver-sys = "0.3.0"
+pretty_assertions = "1.4.1"

View File

@ -2,8 +2,8 @@ use super::*;
 pub(super) fn run<'a, 'input>(
     resolver: &mut GlobalStringIdentResolver2<'input>,
-    directives: Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>,
-) -> Result<Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
+    directives: Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>,
+) -> Result<Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
     directives
         .into_iter()
         .map(|directive| run_directive(resolver, directive))
@ -12,8 +12,8 @@ pub(super) fn run<'a, 'input>(
 fn run_directive<'input>(
     resolver: &mut GlobalStringIdentResolver2,
-    directive: Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>,
-) -> Result<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
+    directive: Directive2<ast::Instruction<SpirvWord>, SpirvWord>,
+) -> Result<Directive2<ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
     Ok(match directive {
         var @ Directive2::Variable(..) => var,
         Directive2::Method(method) => Directive2::Method(run_method(resolver, method)?),
@ -22,13 +22,13 @@ fn run_directive<'input>(
 fn run_method<'input>(
     resolver: &mut GlobalStringIdentResolver2,
-    mut method: Function2<'input, ast::Instruction<SpirvWord>, SpirvWord>,
-) -> Result<Function2<'input, ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
+    mut method: Function2<ast::Instruction<SpirvWord>, SpirvWord>,
+) -> Result<Function2<ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
     let is_declaration = method.body.is_none();
     let mut body = Vec::new();
     let mut remap_returns = Vec::new();
-    if !method.func_decl.name.is_kernel() {
-        for arg in method.func_decl.return_arguments.iter_mut() {
+    if !method.is_kernel {
+        for arg in method.return_arguments.iter_mut() {
             match arg.state_space {
                 ptx_parser::StateSpace::Param => {
                     arg.state_space = ptx_parser::StateSpace::Reg;
@ -51,7 +51,7 @@ fn run_method<'input>(
                 _ => return Err(error_unreachable()),
             }
         }
-        for arg in method.func_decl.input_arguments.iter_mut() {
+        for arg in method.input_arguments.iter_mut() {
             match arg.state_space {
                 ptx_parser::StateSpace::Param => {
                     arg.state_space = ptx_parser::StateSpace::Reg;
@ -95,14 +95,7 @@ fn run_method<'input>(
             Ok::<_, TranslateError>(body)
         })
         .transpose()?;
-    Ok(Function2 {
-        func_decl: method.func_decl,
-        globals: method.globals,
-        body,
-        import_as: method.import_as,
-        tuning: method.tuning,
-        linkage: method.linkage,
-    })
+    Ok(Function2 { body, ..method })
 }
 fn run_statement<'input>(

View File

@ -36,6 +36,7 @@ use llvm_zluda::bit_writer::LLVMWriteBitcodeToMemoryBuffer;
 use llvm_zluda::{core::*, *};
 use llvm_zluda::{prelude::*, LLVMZludaBuildAtomicRMW};
 use llvm_zluda::{LLVMCallConv, LLVMZludaBuildAlloca};
+use ptx_parser::Mul24Control;
 const LLVM_UNNAMED: &CStr = c"";
 // https://llvm.org/docs/AMDGPUUsage.html#address-spaces
@ -65,17 +66,24 @@ impl Drop for Context {
     }
 }
-struct Module(LLVMModuleRef);
+pub struct Module(LLVMModuleRef, Context);
 impl Module {
-    fn new(ctx: &Context, name: &CStr) -> Self {
-        Self(unsafe { LLVMModuleCreateWithNameInContext(name.as_ptr(), ctx.get()) })
+    fn new(ctx: Context, name: &CStr) -> Self {
+        Self(
+            unsafe { LLVMModuleCreateWithNameInContext(name.as_ptr(), ctx.get()) },
+            ctx,
+        )
     }
     fn get(&self) -> LLVMModuleRef {
         self.0
     }
+    fn context(&self) -> &Context {
+        &self.1
+    }
     fn verify(&self) -> Result<(), Message> {
         let mut err = ptr::null_mut();
         let error = unsafe {
@ -92,10 +100,15 @@ impl Module {
         }
     }
-    fn write_bitcode_to_memory(&self) -> MemoryBuffer {
+    pub fn write_bitcode_to_memory(&self) -> MemoryBuffer {
         let memory_buffer = unsafe { LLVMWriteBitcodeToMemoryBuffer(self.get()) };
         MemoryBuffer(memory_buffer)
     }
+    pub fn print_module_to_string(&self) -> Message {
+        let asm = unsafe { LLVMPrintModuleToString(self.get()) };
+        Message(unsafe { CStr::from_ptr(asm) })
+    }
 }
 impl Drop for Module {
@ -130,7 +143,7 @@ impl Drop for Builder {
     }
 }
-struct Message(&'static CStr);
+pub struct Message(&'static CStr);
 impl Drop for Message {
     fn drop(&mut self) {
@ -146,6 +159,12 @@ impl std::fmt::Debug for Message {
     }
 }
+impl Message {
+    pub fn to_str(&self) -> &str {
+        self.0.to_str().unwrap().trim()
+    }
+}
 pub struct MemoryBuffer(LLVMMemoryBufferRef);
 impl Drop for MemoryBuffer {
@ -168,11 +187,11 @@ impl Deref for MemoryBuffer {
 pub(super) fn run<'input>(
     id_defs: GlobalStringIdentResolver2<'input>,
-    directives: Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>,
-) -> Result<MemoryBuffer, TranslateError> {
+    directives: Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>,
+) -> Result<Module, TranslateError> {
     let context = Context::new();
-    let module = Module::new(&context, LLVM_UNNAMED);
-    let mut emit_ctx = ModuleEmitContext::new(&context, &module, &id_defs);
+    let module = Module::new(context, LLVM_UNNAMED);
+    let mut emit_ctx = ModuleEmitContext::new(&module, &id_defs);
     for directive in directives {
         match directive {
             Directive2::Variable(linking, variable) => emit_ctx.emit_global(linking, variable)?,
@ -182,7 +201,7 @@ pub(super) fn run<'input>(
     if let Err(err) = module.verify() {
         panic!("{:?}", err);
     }
-    Ok(module.write_bitcode_to_memory())
+    Ok(module)
 }
 struct ModuleEmitContext<'a, 'input> {
@ -194,11 +213,8 @@ struct ModuleEmitContext<'a, 'input> {
 }
 impl<'a, 'input> ModuleEmitContext<'a, 'input> {
-    fn new(
-        context: &Context,
-        module: &Module,
-        id_defs: &'a GlobalStringIdentResolver2<'input>,
-    ) -> Self {
+    fn new(module: &Module, id_defs: &'a GlobalStringIdentResolver2<'input>) -> Self {
+        let context = module.context();
         ModuleEmitContext {
             context: context.get(),
             module: module.get(),
@ -218,24 +234,20 @@ impl<'a, 'input> ModuleEmitContext<'a, 'input> {
     fn emit_method(
         &mut self,
-        method: Function2<'input, ast::Instruction<SpirvWord>, SpirvWord>,
+        method: Function2<ast::Instruction<SpirvWord>, SpirvWord>,
     ) -> Result<(), TranslateError> {
-        let func_decl = method.func_decl;
         let name = method
             .import_as
             .as_deref()
-            .or_else(|| match func_decl.name {
-                ast::MethodName::Kernel(name) => Some(name),
-                ast::MethodName::Func(id) => self.id_defs.ident_map[&id].name.as_deref(),
-            })
+            .or_else(|| self.id_defs.ident_map[&method.name].name.as_deref())
             .ok_or_else(|| error_unreachable())?;
         let name = CString::new(name).map_err(|_| error_unreachable())?;
         let mut fn_ = unsafe { LLVMGetNamedFunction(self.module, name.as_ptr()) };
         if fn_ == ptr::null_mut() {
             let fn_type = get_function_type(
                 self.context,
-                func_decl.return_arguments.iter().map(|v| &v.v_type),
-                func_decl
+                method.return_arguments.iter().map(|v| &v.v_type),
+                method
                     .input_arguments
                     .iter()
                     .map(|v| get_input_argument_type(self.context, &v.v_type, v.state_space)),
@ -245,15 +257,28 @@ impl<'a, 'input> ModuleEmitContext<'a, 'input> {
self.emit_fn_attribute(fn_, "uniform-work-group-size", "true"); self.emit_fn_attribute(fn_, "uniform-work-group-size", "true");
self.emit_fn_attribute(fn_, "no-trapping-math", "true"); self.emit_fn_attribute(fn_, "no-trapping-math", "true");
} }
if let ast::MethodName::Func(name) = func_decl.name { if !method.is_kernel {
self.resolver.register(name, fn_); self.resolver.register(method.name, fn_);
self.emit_fn_attribute(fn_, "denormal-fp-math-f32", "dynamic");
self.emit_fn_attribute(fn_, "denormal-fp-math", "dynamic");
} else {
self.emit_fn_attribute(
fn_,
"denormal-fp-math-f32",
llvm_ftz(method.flush_to_zero_f32),
);
self.emit_fn_attribute(
fn_,
"denormal-fp-math",
llvm_ftz(method.flush_to_zero_f16f64),
);
} }
for (i, param) in func_decl.input_arguments.iter().enumerate() { for (i, param) in method.input_arguments.iter().enumerate() {
let value = unsafe { LLVMGetParam(fn_, i as u32) }; let value = unsafe { LLVMGetParam(fn_, i as u32) };
let name = self.resolver.get_or_add(param.name); let name = self.resolver.get_or_add(param.name);
unsafe { LLVMSetValueName2(value, name.as_ptr().cast(), name.len()) }; unsafe { LLVMSetValueName2(value, name.as_ptr().cast(), name.len()) };
self.resolver.register(param.name, value); self.resolver.register(param.name, value);
if func_decl.name.is_kernel() { if method.is_kernel {
let attr_kind = unsafe { let attr_kind = unsafe {
LLVMGetEnumAttributeKindForName(b"byref".as_ptr().cast(), b"byref".len()) LLVMGetEnumAttributeKindForName(b"byref".as_ptr().cast(), b"byref".len())
}; };
@ -267,7 +292,7 @@ impl<'a, 'input> ModuleEmitContext<'a, 'input> {
unsafe { LLVMAddAttributeAtIndex(fn_, i as u32 + 1, attr) }; unsafe { LLVMAddAttributeAtIndex(fn_, i as u32 + 1, attr) };
} }
} }
let call_conv = if func_decl.name.is_kernel() { let call_conv = if method.is_kernel {
Self::kernel_call_convention() Self::kernel_call_convention()
} else { } else {
Self::func_call_convention() Self::func_call_convention()
@ -282,7 +307,7 @@ impl<'a, 'input> ModuleEmitContext<'a, 'input> {
unsafe { LLVMAppendBasicBlockInContext(self.context, fn_, LLVM_UNNAMED.as_ptr()) }; unsafe { LLVMAppendBasicBlockInContext(self.context, fn_, LLVM_UNNAMED.as_ptr()) };
unsafe { LLVMPositionBuilderAtEnd(self.builder.get(), real_bb) }; unsafe { LLVMPositionBuilderAtEnd(self.builder.get(), real_bb) };
let mut method_emitter = MethodEmitContext::new(self, fn_, variables_builder); let mut method_emitter = MethodEmitContext::new(self, fn_, variables_builder);
for var in func_decl.return_arguments { for var in method.return_arguments {
method_emitter.emit_variable(var)?; method_emitter.emit_variable(var)?;
} }
for statement in statements.iter() { for statement in statements.iter() {
@ -290,6 +315,17 @@ impl<'a, 'input> ModuleEmitContext<'a, 'input> {
                 method_emitter.emit_label_initial(*label);
             }
         }
+        let mut statements = statements.into_iter();
+        if let Some(Statement::Label(label)) = statements.next() {
+            method_emitter.emit_label_delayed(label)?;
+        } else {
+            return Err(error_unreachable());
+        }
+        method_emitter.emit_kernel_rounding_prelude(
+            method.is_kernel,
+            method.rounding_mode_f32,
+            method.rounding_mode_f16f64,
+        )?;
         for statement in statements {
             method_emitter.emit_statement(statement)?;
         }
@ -417,6 +453,14 @@ }
     }
 }
+fn llvm_ftz(ftz: bool) -> &'static str {
+    if ftz {
+        "preserve-sign"
+    } else {
+        "ieee"
+    }
+}
 fn get_input_argument_type(
     context: LLVMContextRef,
     v_type: &ast::Type,
@ -473,9 +517,32 @@ impl<'a> MethodEmitContext<'a> {
             Statement::FunctionPointer(_) => todo!(),
             Statement::VectorRead(vector_read) => self.emit_vector_read(vector_read)?,
             Statement::VectorWrite(vector_write) => self.emit_vector_write(vector_write)?,
+            Statement::SetMode(mode_reg) => self.emit_set_mode(mode_reg)?,
         })
     }
+    // This should be a kernel attribute, but sadly AMDGPU LLVM target does
+    // not support attribute for it. So we have to set it as the first
+    // instruction in the body of a kernel
+    fn emit_kernel_rounding_prelude(
+        &mut self,
+        is_kernel: bool,
+        rounding_mode_f32: ast::RoundingMode,
+        rounding_mode_f16f64: ast::RoundingMode,
+    ) -> Result<(), TranslateError> {
+        if is_kernel {
+            if rounding_mode_f32 != ast::RoundingMode::NearestEven
+                || rounding_mode_f16f64 != ast::RoundingMode::NearestEven
+            {
+                self.emit_set_mode(ModeRegister::Rounding {
+                    f32: rounding_mode_f32,
+                    f16f64: rounding_mode_f16f64,
+                })?;
+            }
+        }
+        Ok(())
+    }
     fn emit_variable(&mut self, var: ast::Variable<SpirvWord>) -> Result<(), TranslateError> {
         let alloca = unsafe {
             LLVMZludaBuildAlloca(
@ -528,6 +595,7 @@ impl<'a> MethodEmitContext<'a> {
             ast::Instruction::Add { data, arguments } => self.emit_add(data, arguments),
             ast::Instruction::St { data, arguments } => self.emit_st(data, arguments),
             ast::Instruction::Mul { data, arguments } => self.emit_mul(data, arguments),
+            ast::Instruction::Mul24 { data, arguments } => self.emit_mul24(data, arguments),
             ast::Instruction::Setp { data, arguments } => self.emit_setp(data, arguments),
             ast::Instruction::SetpBool { .. } => todo!(),
             ast::Instruction::Not { data, arguments } => self.emit_not(data, arguments),
@ -1128,7 +1196,7 @@ impl<'a> MethodEmitContext<'a> {
         let cos = self.emit_intrinsic(
             c"llvm.cos.f32",
             Some(arguments.dst),
-            &ast::ScalarType::F32.into(),
+            Some(&ast::ScalarType::F32.into()),
             vec![(self.resolver.value(arguments.src)?, llvm_f32)],
         )?;
         unsafe { LLVMZludaSetFastMathFlags(cos, LLVMZludaFastMathApproxFunc) }
@ -1381,7 +1449,7 @@ impl<'a> MethodEmitContext<'a> {
         let sin = self.emit_intrinsic(
             c"llvm.sin.f32",
             Some(arguments.dst),
-            &ast::ScalarType::F32.into(),
+            Some(&ast::ScalarType::F32.into()),
             vec![(self.resolver.value(arguments.src)?, llvm_f32)],
         )?;
         unsafe { LLVMZludaSetFastMathFlags(sin, LLVMZludaFastMathApproxFunc) }
@ -1392,12 +1460,12 @@ impl<'a> MethodEmitContext<'a> {
         &mut self,
         name: &CStr,
         dst: Option<SpirvWord>,
-        return_type: &ast::Type,
+        return_type: Option<&ast::Type>,
         arguments: Vec<(LLVMValueRef, LLVMTypeRef)>,
     ) -> Result<LLVMValueRef, TranslateError> {
         let fn_type = get_function_type(
             self.context,
-            iter::once(return_type),
+            return_type.into_iter(),
             arguments.iter().map(|(_, type_)| Ok(*type_)),
         )?;
         let mut fn_ = unsafe { LLVMGetNamedFunction(self.module, name.as_ptr()) };
@ -1558,7 +1626,7 @@ impl<'a> MethodEmitContext<'a> {
                 return self.emit_cvt_float_to_int(
                     data.from,
                     data.to,
-                    integer_rounding.unwrap_or(ast::RoundingMode::NearestEven),
+                    integer_rounding,
                     arguments,
                     Some(LLVMBuildFPToSI),
                 )
@ -1616,7 +1684,7 @@ impl<'a> MethodEmitContext<'a> {
         let clamped = self.emit_intrinsic(
             c"llvm.umin",
             None,
-            &from.into(),
+            Some(&from.into()),
             vec![
                 (self.resolver.value(arguments.src)?, from_llvm),
                 (max, from_llvm),
@ -1646,7 +1714,7 @@ impl<'a> MethodEmitContext<'a> {
         let zero_clamped = self.emit_intrinsic(
             unsafe { CStr::from_bytes_with_nul_unchecked(zero_clamp_intrinsic.as_bytes()) },
             None,
-            &from.into(),
+            Some(&from.into()),
             vec![
                 (self.resolver.value(arguments.src)?, from_llvm),
                 (zero, from_llvm),
@ -1665,7 +1733,7 @@ impl<'a> MethodEmitContext<'a> {
         let fully_clamped = self.emit_intrinsic(
             unsafe { CStr::from_bytes_with_nul_unchecked(max_clamp_intrinsic.as_bytes()) },
             None,
-            &from.into(),
+            Some(&from.into()),
             vec![(zero_clamped, from_llvm), (max, from_llvm)],
         )?;
         let resize_fn = if to.layout().size() >= from.layout().size() {
@ -1705,7 +1773,7 @@ impl<'a> MethodEmitContext<'a> {
         let rounded_float = self.emit_intrinsic(
             unsafe { CStr::from_bytes_with_nul_unchecked(intrinsic.as_bytes()) },
             None,
-            &from.into(),
+            Some(&from.into()),
             vec![(
                 self.resolver.value(arguments.src)?,
                 get_scalar_type(self.context, from),
@ -1774,7 +1842,7 @@ impl<'a> MethodEmitContext<'a> {
         self.emit_intrinsic(
             intrinsic,
             Some(arguments.dst),
-            &data.type_.into(),
+            Some(&data.type_.into()),
             vec![(self.resolver.value(arguments.src)?, type_)],
         )?;
         Ok(())
@ -1795,7 +1863,7 @@ impl<'a> MethodEmitContext<'a> {
         self.emit_intrinsic(
             intrinsic,
             Some(arguments.dst),
-            &data.type_.into(),
+            Some(&data.type_.into()),
             vec![(self.resolver.value(arguments.src)?, type_)],
         )?;
         Ok(())
@ -1817,7 +1885,7 @@ impl<'a> MethodEmitContext<'a> {
         self.emit_intrinsic(
             intrinsic,
             Some(arguments.dst),
-            &data.type_.into(),
+            Some(&data.type_.into()),
             vec![(self.resolver.value(arguments.src)?, type_)],
         )?;
         Ok(())
@ -1939,7 +2007,7 @@ impl<'a> MethodEmitContext<'a> {
         self.emit_intrinsic(
             intrinsic,
             Some(arguments.dst),
-            &data.type_.into(),
+            Some(&data.type_.into()),
             vec![(
                 self.resolver.value(arguments.src)?,
                 get_scalar_type(self.context, data.type_),
@ -1956,7 +2024,7 @@ impl<'a> MethodEmitContext<'a> {
         self.emit_intrinsic(
             c"llvm.amdgcn.log.f32",
             Some(arguments.dst),
-            &ast::ScalarType::F32.into(),
+            Some(&ast::ScalarType::F32.into()),
             vec![(
                 self.resolver.value(arguments.src)?,
                 get_scalar_type(self.context, ast::ScalarType::F32.into()),
@ -2011,7 +2079,7 @@ impl<'a> MethodEmitContext<'a> {
         self.emit_intrinsic(
             intrinsic,
             Some(arguments.dst),
-            &type_.into(),
+            Some(&type_.into()),
             vec![(self.resolver.value(arguments.src)?, llvm_type)],
         )?;
         Ok(())
@ -2035,7 +2103,7 @@ impl<'a> MethodEmitContext<'a> {
         self.emit_intrinsic(
             unsafe { CStr::from_bytes_with_nul_unchecked(intrinsic.as_bytes()) },
             Some(arguments.dst),
-            &data.type_().into(),
+            Some(&data.type_().into()),
             vec![
                 (self.resolver.value(arguments.src1)?, llvm_type),
                 (self.resolver.value(arguments.src2)?, llvm_type),
@ -2062,7 +2130,7 @@ impl<'a> MethodEmitContext<'a> {
         self.emit_intrinsic(
             unsafe { CStr::from_bytes_with_nul_unchecked(intrinsic.as_bytes()) },
             Some(arguments.dst),
-            &data.type_().into(),
+            Some(&data.type_().into()),
             vec![
                 (self.resolver.value(arguments.src1)?, llvm_type),
                 (self.resolver.value(arguments.src2)?, llvm_type),
@ -2080,7 +2148,7 @@ impl<'a> MethodEmitContext<'a> {
         self.emit_intrinsic(
             unsafe { CStr::from_bytes_with_nul_unchecked(intrinsic.as_bytes()) },
             Some(arguments.dst),
-            &data.type_.into(),
+            Some(&data.type_.into()),
             vec![
                 (
                     self.resolver.value(arguments.src1)?,
@ -2201,12 +2269,118 @@ impl<'a> MethodEmitContext<'a> {
         self.emit_intrinsic(
             unsafe { CStr::from_bytes_with_nul_unchecked(llvm_intrinsic.as_bytes()) },
             Some(arguments.dst),
-            &data.type_.into(),
+            Some(&data.type_.into()),
             intrinsic_arguments,
         )?;
         Ok(())
     }
fn emit_mul24(
&mut self,
data: ast::Mul24Details,
arguments: ast::Mul24Args<SpirvWord>,
) -> Result<(), TranslateError> {
let src1 = self.resolver.value(arguments.src1)?;
let src2 = self.resolver.value(arguments.src2)?;
let name_lo = match data.type_ {
ast::ScalarType::U32 => c"llvm.amdgcn.mul.u24",
ast::ScalarType::S32 => c"llvm.amdgcn.mul.i24",
_ => return Err(error_unreachable()),
};
let res_lo = self.emit_intrinsic(
name_lo,
if data.control == Mul24Control::Lo { Some(arguments.dst) } else { None },
Some(&ast::Type::Scalar(data.type_)),
vec![
(src1, get_scalar_type(self.context, data.type_)),
(src2, get_scalar_type(self.context, data.type_)),
],
)?;
if data.control == Mul24Control::Hi {
// There is an important difference between NVIDIA's mul24.hi and AMD's mulhi.[ui]24.
// NVIDIA: Returns bits 47..16 of the 64-bit result
// AMD: Returns bits 63..32 of the 64-bit result
// Hence we need to compute both hi and lo, shift the results and add them together to replicate NVIDIA's mul24
let name_hi = match data.type_ {
ast::ScalarType::U32 => c"llvm.amdgcn.mulhi.u24",
ast::ScalarType::S32 => c"llvm.amdgcn.mulhi.i24",
_ => return Err(error_unreachable()),
};
let res_hi = self.emit_intrinsic(
name_hi,
None,
Some(&ast::Type::Scalar(data.type_)),
vec![
(src1, get_scalar_type(self.context, data.type_)),
(src2, get_scalar_type(self.context, data.type_)),
],
)?;
let shift_number = unsafe { LLVMConstInt(LLVMInt32TypeInContext(self.context), 16, 0) };
let res_lo_shr = unsafe {
LLVMBuildLShr(self.builder, res_lo, shift_number, LLVM_UNNAMED.as_ptr())
};
let res_hi_shl =
unsafe { LLVMBuildShl(self.builder, res_hi, shift_number, LLVM_UNNAMED.as_ptr()) };
self.resolver
.with_result(arguments.dst, |dst: *const i8| unsafe {
LLVMBuildOr(self.builder, res_lo_shr, res_hi_shl, dst)
});
}
Ok(())
}
fn emit_set_mode(&mut self, mode_reg: ModeRegister) -> Result<(), TranslateError> {
fn hwreg(reg: u32, offset: u32, size: u32) -> u32 {
reg | (offset << 6) | ((size - 1) << 11)
}
fn denormal_to_value(ftz: bool) -> u32 {
if ftz {
0
} else {
3
}
}
fn rounding_to_value(ftz: ast::RoundingMode) -> u32 {
match ftz {
ptx_parser::RoundingMode::NearestEven => 0,
ptx_parser::RoundingMode::Zero => 3,
ptx_parser::RoundingMode::NegativeInf => 2,
ptx_parser::RoundingMode::PositiveInf => 1,
}
}
fn merge_regs(f32: u32, f16f64: u32) -> u32 {
f32 | f16f64 << 2
}
let intrinsic = c"llvm.amdgcn.s.setreg";
let (hwreg, value) = match mode_reg {
ModeRegister::Denormal { f32, f16f64 } => {
let hwreg = hwreg(1, 4, 4);
let f32 = denormal_to_value(f32);
let f16f64 = denormal_to_value(f16f64);
let value = merge_regs(f32, f16f64);
(hwreg, value)
}
ModeRegister::Rounding { f32, f16f64 } => {
let hwreg = hwreg(1, 0, 4);
let f32 = rounding_to_value(f32);
let f16f64 = rounding_to_value(f16f64);
let value = merge_regs(f32, f16f64);
(hwreg, value)
}
};
let llvm_i32 = get_scalar_type(self.context, ast::ScalarType::B32);
let hwreg_llvm = unsafe { LLVMConstInt(llvm_i32, hwreg as _, 0) };
let value_llvm = unsafe { LLVMConstInt(llvm_i32, value as _, 0) };
self.emit_intrinsic(
intrinsic,
None,
None,
vec![(hwreg_llvm, llvm_i32), (value_llvm, llvm_i32)],
)?;
Ok(())
}
/* /*
// Currently unused, LLVM 18 (ROCm 6.2) does not support `llvm.set.rounding` // Currently unused, LLVM 18 (ROCm 6.2) does not support `llvm.set.rounding`
// Should be available in LLVM 19 // Should be available in LLVM 19
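The `emit_mul24` and `emit_set_mode` additions above both come down to bit arithmetic that can be sanity-checked outside the compiler. The following standalone Rust sketch (illustrative only; none of the helper names below are ZLUDA identifiers) emulates `mul.u24`/`mulhi.u24` with 64-bit math to confirm that `(lo >> 16) | (hi << 16)` reproduces NVIDIA's `mul24.hi` semantics (bits 47..16 of the 48-bit product), and prints the `s_setreg` operands that the `hwreg` helper encodes for the rounding and denormal mode fields.

```rust
// Standalone check of the arithmetic behind the mul24 lowering and the
// mode-register prelude shown in the diff above. Nothing here depends on ZLUDA.
fn mul24_full(a: u32, b: u32) -> u64 {
    // 48-bit product of the low 24 bits of both operands
    (a as u64 & 0xFF_FFFF) * (b as u64 & 0xFF_FFFF)
}

fn main() {
    for (a, b) in [(0x00FF_FFFFu32, 0x00FF_FFFF), (12_345, 67_890), (1 << 23, 3)] {
        let full = mul24_full(a, b);
        let lo = full as u32; // what llvm.amdgcn.mul.u24 yields (bits 31..0)
        let hi = (full >> 32) as u32; // what llvm.amdgcn.mulhi.u24 yields (bits 63..32)
        let nvidia_hi = (full >> 16) as u32; // PTX mul24.hi: bits 47..16
        // Recombination emitted for the `hi` form: shift lo right, hi left, then OR
        assert_eq!((lo >> 16) | (hi << 16), nvidia_hi);
    }

    // s_setreg operand encoding from emit_set_mode:
    // hwreg(id, offset, size) = id | offset << 6 | (size - 1) << 11
    let hwreg = |id: u32, offset: u32, size: u32| id | (offset << 6) | ((size - 1) << 11);
    println!("rounding field: {:#x}", hwreg(1, 0, 4)); // prints 0x1801
    println!("denormal field: {:#x}", hwreg(1, 4, 4)); // prints 0x1901
}
```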

View File

@ -2,8 +2,8 @@ use super::*;
 pub(super) fn run<'a, 'input>(
     resolver: &mut GlobalStringIdentResolver2<'input>,
-    directives: Vec<UnconditionalDirective<'input>>,
-) -> Result<Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
+    directives: Vec<UnconditionalDirective>,
+) -> Result<Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
     directives
         .into_iter()
         .map(|directive| run_directive(resolver, directive))
@ -13,11 +13,10 @@ pub(super) fn run<'a, 'input>(
 fn run_directive<'input>(
     resolver: &mut GlobalStringIdentResolver2<'input>,
     directive: Directive2<
-        'input,
         ast::Instruction<ast::ParsedOperand<SpirvWord>>,
         ast::ParsedOperand<SpirvWord>,
     >,
-) -> Result<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
+) -> Result<Directive2<ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
     Ok(match directive {
         Directive2::Variable(linking, var) => Directive2::Variable(linking, var),
         Directive2::Method(method) => Directive2::Method(run_method(resolver, method)?),
@ -27,11 +26,10 @@ fn run_directive<'input>(
 fn run_method<'input>(
     resolver: &mut GlobalStringIdentResolver2<'input>,
     method: Function2<
-        'input,
         ast::Instruction<ast::ParsedOperand<SpirvWord>>,
         ast::ParsedOperand<SpirvWord>,
     >,
-) -> Result<Function2<'input, ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
+) -> Result<Function2<ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
     let body = method
         .body
         .map(|statements| {
@ -43,12 +41,18 @@ fn run_method<'input>(
         })
         .transpose()?;
     Ok(Function2 {
-        func_decl: method.func_decl,
-        globals: method.globals,
         body,
+        return_arguments: method.return_arguments,
+        name: method.name,
+        input_arguments: method.input_arguments,
         import_as: method.import_as,
         tuning: method.tuning,
         linkage: method.linkage,
+        is_kernel: method.is_kernel,
+        flush_to_zero_f32: method.flush_to_zero_f32,
+        flush_to_zero_f16f64: method.flush_to_zero_f16f64,
+        rounding_mode_f32: method.rounding_mode_f32,
+        rounding_mode_f16f64: method.rounding_mode_f16f64,
     })
 }

View File

@ -1,30 +1,33 @@
 use super::*;
 pub(super) fn run<'a, 'input>(
-    resolver: &mut GlobalStringIdentResolver2<'input>,
+    resolver: &'a mut GlobalStringIdentResolver2<'input>,
     special_registers: &'a SpecialRegistersMap2,
-    directives: Vec<UnconditionalDirective<'input>>,
-) -> Result<Vec<UnconditionalDirective<'input>>, TranslateError> {
-    let declarations = SpecialRegistersMap2::generate_declarations(resolver);
-    let mut result = Vec::with_capacity(declarations.len() + directives.len());
+    directives: Vec<UnconditionalDirective>,
+) -> Result<Vec<UnconditionalDirective>, TranslateError> {
+    let mut result = Vec::with_capacity(SpecialRegistersMap2::len() + directives.len());
     let mut sreg_to_function =
-        FxHashMap::with_capacity_and_hasher(declarations.len(), Default::default());
-    for (sreg, declaration) in declarations {
-        let name = if let ast::MethodName::Func(name) = declaration.name {
-            name
-        } else {
-            return Err(error_unreachable());
-        };
-        result.push(UnconditionalDirective::Method(UnconditionalFunction {
-            func_decl: declaration,
-            globals: Vec::new(),
-            body: None,
-            import_as: None,
-            tuning: Vec::new(),
-            linkage: ast::LinkingDirective::EXTERN,
-        }));
-        sreg_to_function.insert(sreg, name);
-    }
+        FxHashMap::with_capacity_and_hasher(SpecialRegistersMap2::len(), Default::default());
+    SpecialRegistersMap2::foreach_declaration(
+        resolver,
+        |sreg, (return_arguments, name, input_arguments)| {
+            result.push(UnconditionalDirective::Method(UnconditionalFunction {
+                return_arguments,
+                name,
+                input_arguments,
+                body: None,
+                import_as: None,
+                tuning: Vec::new(),
+                linkage: ast::LinkingDirective::EXTERN,
+                is_kernel: false,
+                flush_to_zero_f32: false,
+                flush_to_zero_f16f64: false,
+                rounding_mode_f32: ptx_parser::RoundingMode::NearestEven,
+                rounding_mode_f16f64: ptx_parser::RoundingMode::NearestEven,
+            }));
+            sreg_to_function.insert(sreg, name);
+        },
+    );
     let mut visitor = SpecialRegisterResolver {
         resolver,
         special_registers,
@ -39,8 +42,8 @@ pub(super) fn run<'a, 'input>(
 fn run_directive<'a, 'input>(
     visitor: &mut SpecialRegisterResolver<'a, 'input>,
-    directive: UnconditionalDirective<'input>,
-) -> Result<UnconditionalDirective<'input>, TranslateError> {
+    directive: UnconditionalDirective,
+) -> Result<UnconditionalDirective, TranslateError> {
     Ok(match directive {
         var @ Directive2::Variable(..) => var,
         Directive2::Method(method) => Directive2::Method(run_method(visitor, method)?),
@ -49,8 +52,8 @@ fn run_directive<'a, 'input>(
 fn run_method<'a, 'input>(
     visitor: &mut SpecialRegisterResolver<'a, 'input>,
-    method: UnconditionalFunction<'input>,
-) -> Result<UnconditionalFunction<'input>, TranslateError> {
+    method: UnconditionalFunction,
+) -> Result<UnconditionalFunction, TranslateError> {
     let body = method
         .body
         .map(|statements| {
@ -61,14 +64,7 @@ fn run_method<'a, 'input>(
             Ok::<_, TranslateError>(result)
         })
         .transpose()?;
-    Ok(Function2 {
-        func_decl: method.func_decl,
-        globals: method.globals,
-        body,
-        import_as: method.import_as,
-        tuning: method.tuning,
-        linkage: method.linkage,
-    })
+    Ok(Function2 { body, ..method })
 }
 fn run_statement<'a, 'input>(

View File

@ -1,8 +1,8 @@
 use super::*;
 pub(super) fn run<'input>(
-    directives: Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>,
-) -> Result<Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
+    directives: Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>,
+) -> Result<Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
     let mut result = Vec::with_capacity(directives.len());
     for mut directive in directives.into_iter() {
         run_directive(&mut result, &mut directive)?;
@ -12,8 +12,8 @@ pub(super) fn run<'input>(
 }
 fn run_directive<'input>(
-    result: &mut Vec<Directive2<'input, ptx_parser::Instruction<SpirvWord>, SpirvWord>>,
-    directive: &mut Directive2<'input, ptx_parser::Instruction<SpirvWord>, SpirvWord>,
+    result: &mut Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>,
+    directive: &mut Directive2<ast::Instruction<SpirvWord>, SpirvWord>,
 ) -> Result<(), TranslateError> {
     match directive {
         Directive2::Variable(..) => {}
@ -23,8 +23,8 @@ fn run_directive<'input>(
 }
 fn run_function<'input>(
-    result: &mut Vec<Directive2<'input, ptx_parser::Instruction<SpirvWord>, SpirvWord>>,
-    function: &mut Function2<'input, ptx_parser::Instruction<SpirvWord>, SpirvWord>,
+    result: &mut Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>,
+    function: &mut Function2<ast::Instruction<SpirvWord>, SpirvWord>,
 ) {
     function.body = function.body.take().map(|statements| {
         statements

View File

@ -11,8 +11,8 @@ use super::*;
 // pass, so we do nothing there
 pub(super) fn run<'a, 'input>(
     resolver: &mut GlobalStringIdentResolver2<'input>,
-    directives: Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>,
-) -> Result<Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
+    directives: Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>,
+) -> Result<Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
     directives
         .into_iter()
         .map(|directive| run_directive(resolver, directive))
@ -21,8 +21,8 @@ pub(super) fn run<'a, 'input>(
 fn run_directive<'a, 'input>(
     resolver: &mut GlobalStringIdentResolver2<'input>,
-    directive: Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>,
-) -> Result<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
+    directive: Directive2<ast::Instruction<SpirvWord>, SpirvWord>,
+) -> Result<Directive2<ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
     Ok(match directive {
         var @ Directive2::Variable(..) => var,
         Directive2::Method(method) => {
@ -34,12 +34,11 @@ fn run_directive<'a, 'input>(
 fn run_method<'a, 'input>(
     mut visitor: InsertMemSSAVisitor<'a, 'input>,
-    method: Function2<'input, ast::Instruction<SpirvWord>, SpirvWord>,
-) -> Result<Function2<'input, ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
-    let mut func_decl = method.func_decl;
-    let is_kernel = func_decl.name.is_kernel();
+    mut method: Function2<ast::Instruction<SpirvWord>, SpirvWord>,
+) -> Result<Function2<ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
+    let is_kernel = method.is_kernel;
     if is_kernel {
-        for arg in func_decl.input_arguments.iter_mut() {
+        for arg in method.input_arguments.iter_mut() {
             let old_name = arg.name;
             let old_space = arg.state_space;
             let new_space = ast::StateSpace::ParamEntry;
@ -51,10 +50,10 @@ fn run_method<'a, 'input>(
             arg.state_space = new_space;
         }
     };
-    for arg in func_decl.return_arguments.iter_mut() {
+    for arg in method.return_arguments.iter_mut() {
         visitor.visit_variable(arg)?;
     }
-    let return_arguments = &func_decl.return_arguments[..];
+    let return_arguments = &method.return_arguments[..];
     let body = method
         .body
         .map(move |statements| {
@ -65,14 +64,7 @@ fn run_method<'a, 'input>(
             Ok::<_, TranslateError>(result)
         })
        .transpose()?;
-    Ok(Function2 {
-        func_decl: func_decl,
-        globals: method.globals,
-        body,
-        import_as: method.import_as,
-        tuning: method.tuning,
-        linkage: method.linkage,
-    })
+    Ok(Function2 { body, ..method })
 }
 fn run_statement<'a, 'input>(

View File

@ -19,8 +19,8 @@ use ptx_parser as ast;
 */
 pub(super) fn run<'input>(
     resolver: &mut GlobalStringIdentResolver2<'input>,
-    directives: Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>,
-) -> Result<Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
+    directives: Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>,
+) -> Result<Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
     directives
         .into_iter()
         .map(|directive| run_directive(resolver, directive))
@ -29,8 +29,8 @@ pub(super) fn run<'input>(
 fn run_directive<'a, 'input>(
     resolver: &mut GlobalStringIdentResolver2<'input>,
-    directive: Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>,
-) -> Result<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
+    directive: Directive2<ast::Instruction<SpirvWord>, SpirvWord>,
+) -> Result<Directive2<ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
     Ok(match directive {
         var @ Directive2::Variable(..) => var,
         Directive2::Method(mut method) => {

View File

@ -0,0 +1,29 @@
.version 6.5
.target sm_50
.address_size 64
.func use_modes();
.visible .entry kernel()
{
.reg .f32 temp;
add.rz.ftz.f32 temp, temp, temp;
call use_modes;
add.rp.ftz.f32 temp, temp, temp;
ret;
}
.func use_modes()
{
.reg .f32 temp;
.reg .pred pred;
@pred bra SET_RM;
@!pred bra SET_RZ;
SET_RM:
add.rm.f32 temp, temp, temp;
ret;
SET_RZ:
add.rz.f32 temp, temp, temp;
ret;
}

View File

@ -0,0 +1,15 @@
.version 6.5
.target sm_30
.address_size 64
.visible .entry add()
{
.reg .f32 temp<3>;
add.ftz.f16 temp2, temp1, temp0;
add.ftz.f32 temp2, temp1, temp0;
add.f16 temp2, temp1, temp0;
add.f32 temp2, temp1, temp0;
ret;
}

File diff suppressed because it is too large

View File

@ -0,0 +1,399 @@
use super::*;
use int_enum::IntEnum;
use strum::EnumCount;
#[repr(usize)]
#[derive(IntEnum, Eq, PartialEq, Copy, Clone, Debug)]
enum Bool {
False = 0,
True = 1,
}
fn ftz() -> InstructionModes {
InstructionModes {
denormal_f32: Some(DenormalMode::FlushToZero),
denormal_f16f64: None,
rounding_f32: None,
rounding_f16f64: None,
}
}
fn preserve() -> InstructionModes {
InstructionModes {
denormal_f32: Some(DenormalMode::Preserve),
denormal_f16f64: None,
rounding_f32: None,
rounding_f16f64: None,
}
}
#[test]
fn transitive_mixed() {
let mut graph = ControlFlowGraph::new();
let entry_id = SpirvWord(1);
let false_id = SpirvWord(2);
let empty_id = SpirvWord(3);
let false2_id = SpirvWord(4);
let entry = graph.add_entry_basic_block(entry_id);
graph.add_jump(entry, false_id);
let false_ = graph.get_or_add_basic_block(false_id);
graph.set_modes(false_, ftz(), ftz());
graph.add_jump(false_, empty_id);
let empty = graph.get_or_add_basic_block(empty_id);
graph.add_jump(empty, false2_id);
let false2_ = graph.get_or_add_basic_block(false2_id);
graph.set_modes(false2_, ftz(), ftz());
let partial_result = super::compute_single_mode_insertions(&graph, |node| node.denormal_f32);
assert_eq!(partial_result.bb_must_insert_mode.len(), 0);
assert_eq!(partial_result.bb_maybe_insert_mode.len(), 1);
assert_eq!(
partial_result.bb_maybe_insert_mode[&false_id],
(DenormalMode::FlushToZero, iter::once(entry_id).collect())
);
let result = optimize_mode_insertions::<DenormalMode, { DenormalMode::COUNT }>(partial_result);
assert_eq!(result.basic_blocks.len(), 0);
assert_eq!(result.kernels.len(), 1);
assert_eq!(result.kernels[&entry_id], DenormalMode::FlushToZero);
}
#[test]
fn transitive_change_twice() {
let mut graph = ControlFlowGraph::new();
let entry_id = SpirvWord(1);
let false_id = SpirvWord(2);
let empty_id = SpirvWord(3);
let true_id = SpirvWord(4);
let entry = graph.add_entry_basic_block(entry_id);
graph.add_jump(entry, false_id);
let false_ = graph.get_or_add_basic_block(false_id);
graph.set_modes(false_, ftz(), ftz());
graph.add_jump(false_, empty_id);
let empty = graph.get_or_add_basic_block(empty_id);
graph.add_jump(empty, true_id);
let true_ = graph.get_or_add_basic_block(true_id);
graph.set_modes(true_, preserve(), preserve());
let partial_result = super::compute_single_mode_insertions(&graph, |node| node.denormal_f32);
assert_eq!(partial_result.bb_must_insert_mode.len(), 1);
assert!(partial_result.bb_must_insert_mode.contains(&true_id));
assert_eq!(partial_result.bb_maybe_insert_mode.len(), 1);
assert_eq!(
partial_result.bb_maybe_insert_mode[&false_id],
(DenormalMode::FlushToZero, iter::once(entry_id).collect())
);
let result = optimize_mode_insertions::<DenormalMode, { DenormalMode::COUNT }>(partial_result);
assert_eq!(result.basic_blocks, iter::once(true_id).collect());
assert_eq!(result.kernels.len(), 1);
assert_eq!(result.kernels[&entry_id], DenormalMode::FlushToZero);
}
#[test]
fn transitive_change() {
let mut graph = ControlFlowGraph::new();
let entry_id = SpirvWord(1);
let empty_id = SpirvWord(2);
let true_id = SpirvWord(3);
let entry = graph.add_entry_basic_block(entry_id);
graph.add_jump(entry, empty_id);
let empty = graph.get_or_add_basic_block(empty_id);
graph.add_jump(empty, true_id);
let true_ = graph.get_or_add_basic_block(true_id);
graph.set_modes(true_, preserve(), preserve());
let partial_result = super::compute_single_mode_insertions(&graph, |node| node.denormal_f32);
assert_eq!(partial_result.bb_must_insert_mode.len(), 0);
assert_eq!(partial_result.bb_maybe_insert_mode.len(), 1);
assert_eq!(
partial_result.bb_maybe_insert_mode[&true_id],
(DenormalMode::Preserve, iter::once(entry_id).collect())
);
let result = optimize_mode_insertions::<DenormalMode, { DenormalMode::COUNT }>(partial_result);
assert_eq!(result.basic_blocks.len(), 0);
assert_eq!(result.kernels.len(), 1);
assert_eq!(result.kernels[&entry_id], DenormalMode::Preserve);
}
#[test]
fn codependency() {
let mut graph = ControlFlowGraph::new();
let entry_id = SpirvWord(1);
let left_f_id = SpirvWord(2);
let right_f_id = SpirvWord(3);
let left_none_id = SpirvWord(4);
let mid_none_id = SpirvWord(5);
let right_none_id = SpirvWord(6);
let entry = graph.add_entry_basic_block(entry_id);
graph.add_jump(entry, left_f_id);
graph.add_jump(entry, right_f_id);
let left_f = graph.get_or_add_basic_block(left_f_id);
graph.set_modes(left_f, ftz(), ftz());
let right_f = graph.get_or_add_basic_block(right_f_id);
graph.set_modes(right_f, ftz(), ftz());
graph.add_jump(left_f, left_none_id);
let left_none = graph.get_or_add_basic_block(left_none_id);
graph.add_jump(right_f, right_none_id);
let right_none = graph.get_or_add_basic_block(right_none_id);
graph.add_jump(left_none, mid_none_id);
graph.add_jump(right_none, mid_none_id);
let mid_none = graph.get_or_add_basic_block(mid_none_id);
graph.add_jump(mid_none, left_none_id);
graph.add_jump(mid_none, right_none_id);
//println!(
// "{:?}",
// petgraph::dot::Dot::with_config(&graph.graph, &[petgraph::dot::Config::EdgeNoLabel])
//);
let partial_result = super::compute_single_mode_insertions(&graph, |node| node.denormal_f32);
assert_eq!(partial_result.bb_must_insert_mode.len(), 0);
assert_eq!(partial_result.bb_maybe_insert_mode.len(), 2);
assert_eq!(
partial_result.bb_maybe_insert_mode[&left_f_id],
(DenormalMode::FlushToZero, iter::once(entry_id).collect())
);
assert_eq!(
partial_result.bb_maybe_insert_mode[&right_f_id],
(DenormalMode::FlushToZero, iter::once(entry_id).collect())
);
let result = optimize_mode_insertions::<DenormalMode, { DenormalMode::COUNT }>(partial_result);
assert_eq!(result.basic_blocks.len(), 0);
assert_eq!(result.kernels.len(), 1);
assert_eq!(result.kernels[&entry_id], DenormalMode::FlushToZero);
}
static FOLD_DENORMAL_PTX: &'static str = include_str!("fold_denormal.ptx");
#[test]
fn fold_denormal() {
let method = compile_methods(FOLD_DENORMAL_PTX).pop().unwrap();
assert_eq!(true, method.flush_to_zero_f32);
assert_eq!(true, method.flush_to_zero_f16f64);
let method_body = method.body.unwrap();
assert!(matches!(
&*method_body,
[
Statement::Label(..),
Statement::Variable(..),
Statement::Variable(..),
Statement::Variable(..),
Statement::Instruction(ast::Instruction::Add { .. }),
Statement::Instruction(ast::Instruction::Add { .. }),
Statement::SetMode(ModeRegister::Denormal {
f32: false,
f16f64: false
}),
Statement::Instruction(ast::Instruction::Add { .. }),
Statement::Instruction(ast::Instruction::Add { .. }),
Statement::Instruction(ast::Instruction::Ret { .. }),
]
));
}
fn compile_methods(ptx: &str) -> Vec<Function2<ast::Instruction<SpirvWord>, SpirvWord>> {
use crate::pass::*;
let module = ptx_parser::parse_module_checked(ptx).unwrap();
let mut flat_resolver = GlobalStringIdentResolver2::new(SpirvWord(1));
let mut scoped_resolver = ScopedResolver::new(&mut flat_resolver);
let directives = normalize_identifiers2::run(&mut scoped_resolver, module.directives).unwrap();
let directives = normalize_predicates2::run(&mut flat_resolver, directives).unwrap();
let directives = expand_operands::run(&mut flat_resolver, directives).unwrap();
let directives = normalize_basic_blocks::run(&mut flat_resolver, directives).unwrap();
let directives = super::run(&mut flat_resolver, directives).unwrap();
directives
.into_iter()
.filter_map(|s| match s {
Directive2::Method(m) => Some(m),
_ => None,
})
.collect::<Vec<_>>()
}
static CALL_WITH_MODE_PTX: &'static str = include_str!("call_with_mode.ptx");
#[test]
fn call_with_mode() {
let methods = compile_methods(CALL_WITH_MODE_PTX);
assert!(matches!(methods[0].body, None));
let method_1 = methods[1].body.as_ref().unwrap();
assert!(matches!(
&**method_1,
[
Statement::Label(..),
Statement::Variable(..),
Statement::Instruction(ast::Instruction::Add { .. }),
Statement::Instruction(ast::Instruction::Call { .. }),
Statement::Instruction(ast::Instruction::Bra { .. }),
Statement::Label(..),
// Dual prelude
Statement::SetMode(ModeRegister::Denormal {
f32: true,
f16f64: true
}),
Statement::SetMode(ModeRegister::Rounding {
f32: ast::RoundingMode::PositiveInf,
f16f64: ast::RoundingMode::NearestEven
}),
Statement::Instruction(ast::Instruction::Bra { .. }),
// Denormal prelude
Statement::Label(..),
Statement::SetMode(ModeRegister::Denormal {
f32: true,
f16f64: true
}),
Statement::Instruction(ast::Instruction::Bra { .. }),
// Rounding prelude
Statement::Label(..),
Statement::SetMode(ModeRegister::Rounding {
f32: ast::RoundingMode::PositiveInf,
f16f64: ast::RoundingMode::NearestEven
}),
Statement::Instruction(ast::Instruction::Bra { .. }),
Statement::Label(..),
Statement::Instruction(ast::Instruction::Add { .. }),
Statement::Instruction(ast::Instruction::Ret { .. }),
]
));
let [to_fn0] = calls(method_1);
let [_, dual_prelude, _, _, add] = labels(method_1);
let [post_call, post_prelude_dual, post_prelude_denormal, post_prelude_rounding] =
branches(method_1);
assert_eq!(methods[0].name, to_fn0);
assert_eq!(post_call, dual_prelude);
assert_eq!(post_prelude_dual, add);
assert_eq!(post_prelude_denormal, add);
assert_eq!(post_prelude_rounding, add);
let method_2 = methods[2].body.as_ref().unwrap();
assert!(matches!(
&**method_2,
[
Statement::Label(..),
Statement::Variable(..),
Statement::Variable(..),
Statement::Conditional(..),
Statement::Label(..),
Statement::Conditional(..),
Statement::Label(..),
Statement::Instruction(ast::Instruction::Bra { .. }),
Statement::Label(..),
// Dual prelude
Statement::SetMode(ModeRegister::Denormal {
f32: false,
f16f64: true
}),
Statement::SetMode(ModeRegister::Rounding {
f32: ast::RoundingMode::NegativeInf,
f16f64: ast::RoundingMode::NearestEven
}),
Statement::Instruction(ast::Instruction::Bra { .. }),
// Denormal prelude
Statement::Label(..),
Statement::SetMode(ModeRegister::Denormal {
f32: false,
f16f64: true
}),
Statement::Instruction(ast::Instruction::Bra { .. }),
// Rounding prelude
Statement::Label(..),
Statement::SetMode(ModeRegister::Rounding {
f32: ast::RoundingMode::NegativeInf,
f16f64: ast::RoundingMode::NearestEven
}),
Statement::Instruction(ast::Instruction::Bra { .. }),
Statement::Label(..),
Statement::Instruction(ast::Instruction::Add { .. }),
Statement::Instruction(ast::Instruction::Bra { .. }),
Statement::Label(..),
Statement::SetMode(ModeRegister::Denormal {
f32: false,
f16f64: true
}),
Statement::Instruction(ast::Instruction::Bra { .. }),
Statement::Label(..),
Statement::Instruction(ast::Instruction::Add { .. }),
Statement::Instruction(ast::Instruction::Bra { .. }),
Statement::Label(..),
Statement::Instruction(ast::Instruction::Ret { .. }),
]
));
let [(if_rm_true, if_rm_false), (if_rz_true, if_rz_false)] = conditionals(method_2);
let [_, conditional2, post_conditional2, prelude_dual, _, _, add1, add2_set_denormal, add2, ret] =
labels(method_2);
let [post_conditional2_jump, post_prelude_dual, post_prelude_denormal, post_prelude_rounding, post_add1, post_add2_set_denormal, post_add2] =
branches(method_2);
assert_eq!(if_rm_true, prelude_dual);
assert_eq!(if_rm_false, conditional2);
assert_eq!(if_rz_true, post_conditional2);
assert_eq!(if_rz_false, add2_set_denormal);
assert_eq!(post_conditional2_jump, prelude_dual);
assert_eq!(post_prelude_dual, add1);
assert_eq!(post_prelude_denormal, add1);
assert_eq!(post_prelude_rounding, add1);
assert_eq!(post_add1, ret);
assert_eq!(post_add2_set_denormal, add2);
assert_eq!(post_add2, ret);
}
fn branches<const N: usize>(
fn_: &Vec<Statement<ast::Instruction<SpirvWord>, SpirvWord>>,
) -> [SpirvWord; N] {
fn_.iter()
.filter_map(|s| match s {
Statement::Instruction(ast::Instruction::Bra {
arguments: ast::BraArgs { src },
}) => Some(*src),
_ => None,
})
.collect::<Vec<_>>()
.try_into()
.unwrap()
}
fn labels<const N: usize>(
fn_: &Vec<Statement<ast::Instruction<SpirvWord>, SpirvWord>>,
) -> [SpirvWord; N] {
fn_.iter()
.filter_map(
|s: &Statement<ptx_parser::Instruction<SpirvWord>, SpirvWord>| match s {
Statement::Label(label) => Some(*label),
_ => None,
},
)
.collect::<Vec<_>>()
.try_into()
.unwrap()
}
fn calls<const N: usize>(
fn_: &Vec<Statement<ast::Instruction<SpirvWord>, SpirvWord>>,
) -> [SpirvWord; N] {
fn_.iter()
.filter_map(|s| match s {
Statement::Instruction(ast::Instruction::Call {
arguments: ast::CallArgs { func, .. },
..
}) => Some(*func),
_ => None,
})
.collect::<Vec<_>>()
.try_into()
.unwrap()
}
fn conditionals<const N: usize>(
fn_: &Vec<Statement<ast::Instruction<SpirvWord>, SpirvWord>>,
) -> [(SpirvWord, SpirvWord); N] {
fn_.iter()
.filter_map(|s| match s {
Statement::Conditional(BrachCondition {
if_true, if_false, ..
}) => Some((*if_true, *if_false)),
_ => None,
})
.collect::<Vec<_>>()
.try_into()
.unwrap()
}

View File

@ -17,12 +17,15 @@ mod expand_operands;
mod fix_special_registers2; mod fix_special_registers2;
mod hoist_globals; mod hoist_globals;
mod insert_explicit_load_store; mod insert_explicit_load_store;
mod instruction_mode_to_global_mode;
mod insert_implicit_conversions2; mod insert_implicit_conversions2;
mod normalize_basic_blocks;
mod normalize_identifiers2; mod normalize_identifiers2;
mod normalize_predicates2; mod normalize_predicates2;
mod remove_unreachable_basic_blocks;
mod replace_instructions_with_function_calls; mod replace_instructions_with_function_calls;
mod resolve_function_pointers;
mod replace_known_functions; mod replace_known_functions;
mod resolve_function_pointers;
static ZLUDA_PTX_IMPL: &'static [u8] = include_bytes!("../../lib/zluda_ptx_impl.bc"); static ZLUDA_PTX_IMPL: &'static [u8] = include_bytes!("../../lib/zluda_ptx_impl.bc");
const ZLUDA_PTX_PREFIX: &'static str = "__zluda_ptx_impl_"; const ZLUDA_PTX_PREFIX: &'static str = "__zluda_ptx_impl_";
@ -43,12 +46,15 @@ pub fn to_llvm_module<'input>(ast: ast::Module<'input>) -> Result<Module, Transl
let mut scoped_resolver = ScopedResolver::new(&mut flat_resolver); let mut scoped_resolver = ScopedResolver::new(&mut flat_resolver);
let sreg_map = SpecialRegistersMap2::new(&mut scoped_resolver)?; let sreg_map = SpecialRegistersMap2::new(&mut scoped_resolver)?;
let directives = normalize_identifiers2::run(&mut scoped_resolver, ast.directives)?; let directives = normalize_identifiers2::run(&mut scoped_resolver, ast.directives)?;
let directives = replace_known_functions::run(&flat_resolver, directives); let directives = replace_known_functions::run(&mut flat_resolver, directives);
let directives = normalize_predicates2::run(&mut flat_resolver, directives)?; let directives = normalize_predicates2::run(&mut flat_resolver, directives)?;
let directives = resolve_function_pointers::run(directives)?; let directives = resolve_function_pointers::run(directives)?;
let directives: Vec<Directive2<'_, ptx_parser::Instruction<ptx_parser::ParsedOperand<SpirvWord>>, ptx_parser::ParsedOperand<SpirvWord>>> = fix_special_registers2::run(&mut flat_resolver, &sreg_map, directives)?; let directives = fix_special_registers2::run(&mut flat_resolver, &sreg_map, directives)?;
let directives = expand_operands::run(&mut flat_resolver, directives)?; let directives = expand_operands::run(&mut flat_resolver, directives)?;
let directives = deparamize_functions::run(&mut flat_resolver, directives)?; let directives = deparamize_functions::run(&mut flat_resolver, directives)?;
let directives = normalize_basic_blocks::run(&mut flat_resolver, directives)?;
let directives = remove_unreachable_basic_blocks::run(directives)?;
let directives = instruction_mode_to_global_mode::run(&mut flat_resolver, directives)?;
let directives = insert_explicit_load_store::run(&mut flat_resolver, directives)?; let directives = insert_explicit_load_store::run(&mut flat_resolver, directives)?;
let directives = insert_implicit_conversions2::run(&mut flat_resolver, directives)?; let directives = insert_implicit_conversions2::run(&mut flat_resolver, directives)?;
let directives = replace_instructions_with_function_calls::run(&mut flat_resolver, directives)?; let directives = replace_instructions_with_function_calls::run(&mut flat_resolver, directives)?;
@ -61,7 +67,7 @@ pub fn to_llvm_module<'input>(ast: ast::Module<'input>) -> Result<Module, Transl
} }
pub struct Module { pub struct Module {
pub llvm_ir: emit_llvm::MemoryBuffer, pub llvm_ir: emit_llvm::Module,
pub kernel_info: HashMap<String, KernelInfo>, pub kernel_info: HashMap<String, KernelInfo>,
} }
@ -195,6 +201,20 @@ enum Statement<I, P: ast::Operand> {
FunctionPointer(FunctionPointerDetails), FunctionPointer(FunctionPointerDetails),
VectorRead(VectorRead), VectorRead(VectorRead),
VectorWrite(VectorWrite), VectorWrite(VectorWrite),
SetMode(ModeRegister),
}
#[derive(Eq, PartialEq, Clone, Copy)]
#[cfg_attr(test, derive(Debug))]
enum ModeRegister {
Denormal {
f32: bool,
f16f64: bool,
},
Rounding {
f32: ast::RoundingMode,
f16f64: ast::RoundingMode,
},
} }
impl<T: ast::Operand<Ident = SpirvWord>> Statement<ast::Instruction<T>, T> { impl<T: ast::Operand<Ident = SpirvWord>> Statement<ast::Instruction<T>, T> {
@ -467,6 +487,7 @@ impl<T: ast::Operand<Ident = SpirvWord>> Statement<ast::Instruction<T>, T> {
let src = visitor.visit_ident(src, None, false, false)?; let src = visitor.visit_ident(src, None, false, false)?;
Statement::FunctionPointer(FunctionPointerDetails { dst, src }) Statement::FunctionPointer(FunctionPointerDetails { dst, src })
} }
Statement::SetMode(mode_register) => Statement::SetMode(mode_register),
}) })
} }
} }
@ -525,7 +546,7 @@ struct FunctionPointerDetails {
src: SpirvWord, src: SpirvWord,
} }
#[derive(Copy, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] #[derive(Copy, Clone, PartialEq, Eq, Hash, PartialOrd, Ord, Debug)]
pub struct SpirvWord(u32); pub struct SpirvWord(u32);
impl From<u32> for SpirvWord { impl From<u32> for SpirvWord {
@ -557,22 +578,27 @@ type NormalizedStatement = Statement<
ast::ParsedOperand<SpirvWord>, ast::ParsedOperand<SpirvWord>,
>; >;
enum Directive2<'input, Instruction, Operand: ast::Operand> { enum Directive2<Instruction, Operand: ast::Operand> {
Variable(ast::LinkingDirective, ast::Variable<SpirvWord>), Variable(ast::LinkingDirective, ast::Variable<SpirvWord>),
Method(Function2<'input, Instruction, Operand>), Method(Function2<Instruction, Operand>),
} }
struct Function2<'input, Instruction, Operand: ast::Operand> { struct Function2<Instruction, Operand: ast::Operand> {
pub func_decl: ast::MethodDeclaration<'input, SpirvWord>, pub return_arguments: Vec<ast::Variable<Operand::Ident>>,
pub globals: Vec<ast::Variable<SpirvWord>>, pub name: Operand::Ident,
pub input_arguments: Vec<ast::Variable<Operand::Ident>>,
pub body: Option<Vec<Statement<Instruction, Operand>>>, pub body: Option<Vec<Statement<Instruction, Operand>>>,
is_kernel: bool,
import_as: Option<String>, import_as: Option<String>,
tuning: Vec<ast::TuningDirective>, tuning: Vec<ast::TuningDirective>,
linkage: ast::LinkingDirective, linkage: ast::LinkingDirective,
flush_to_zero_f32: bool,
flush_to_zero_f16f64: bool,
rounding_mode_f32: ast::RoundingMode,
rounding_mode_f16f64: ast::RoundingMode,
} }
type NormalizedDirective2<'input> = Directive2< type NormalizedDirective2 = Directive2<
'input,
( (
Option<ast::PredAt<SpirvWord>>, Option<ast::PredAt<SpirvWord>>,
ast::Instruction<ast::ParsedOperand<SpirvWord>>, ast::Instruction<ast::ParsedOperand<SpirvWord>>,
@ -580,8 +606,7 @@ type NormalizedDirective2<'input> = Directive2<
ast::ParsedOperand<SpirvWord>, ast::ParsedOperand<SpirvWord>,
>; >;
type NormalizedFunction2<'input> = Function2< type NormalizedFunction2 = Function2<
'input,
( (
Option<ast::PredAt<SpirvWord>>, Option<ast::PredAt<SpirvWord>>,
ast::Instruction<ast::ParsedOperand<SpirvWord>>, ast::Instruction<ast::ParsedOperand<SpirvWord>>,
@ -589,17 +614,11 @@ type NormalizedFunction2<'input> = Function2<
ast::ParsedOperand<SpirvWord>, ast::ParsedOperand<SpirvWord>,
>; >;
type UnconditionalDirective<'input> = Directive2< type UnconditionalDirective =
'input, Directive2<ast::Instruction<ast::ParsedOperand<SpirvWord>>, ast::ParsedOperand<SpirvWord>>;
ast::Instruction<ast::ParsedOperand<SpirvWord>>,
ast::ParsedOperand<SpirvWord>,
>;
type UnconditionalFunction<'input> = Function2< type UnconditionalFunction =
'input, Function2<ast::Instruction<ast::ParsedOperand<SpirvWord>>, ast::ParsedOperand<SpirvWord>>;
ast::Instruction<ast::ParsedOperand<SpirvWord>>,
ast::ParsedOperand<SpirvWord>,
>;
struct GlobalStringIdentResolver2<'input> { struct GlobalStringIdentResolver2<'input> {
pub(crate) current_id: SpirvWord, pub(crate) current_id: SpirvWord,
@ -805,47 +824,45 @@ impl SpecialRegistersMap2 {
self.id_to_reg.get(&id).copied() self.id_to_reg.get(&id).copied()
} }
fn generate_declarations<'a, 'input>( fn len() -> usize {
PtxSpecialRegister::iter().len()
}
fn foreach_declaration<'a, 'input>(
resolver: &'a mut GlobalStringIdentResolver2<'input>, resolver: &'a mut GlobalStringIdentResolver2<'input>,
) -> impl ExactSizeIterator< mut fn_: impl FnMut(
Item = (
PtxSpecialRegister, PtxSpecialRegister,
ast::MethodDeclaration<'input, SpirvWord>, (
Vec<ast::Variable<SpirvWord>>,
SpirvWord,
Vec<ast::Variable<SpirvWord>>,
),
), ),
> + 'a { ) {
PtxSpecialRegister::iter().map(|sreg| { for sreg in PtxSpecialRegister::iter() {
let external_fn_name = [ZLUDA_PTX_PREFIX, sreg.get_unprefixed_function_name()].concat(); let external_fn_name = [ZLUDA_PTX_PREFIX, sreg.get_unprefixed_function_name()].concat();
let name = let name = resolver.register_named(Cow::Owned(external_fn_name), None);
ast::MethodName::Func(resolver.register_named(Cow::Owned(external_fn_name), None));
let return_type = sreg.get_function_return_type(); let return_type = sreg.get_function_return_type();
let input_type = sreg.get_function_input_type(); let input_type = sreg.get_function_input_type();
( let return_arguments = vec![ast::Variable {
sreg, align: None,
ast::MethodDeclaration { v_type: return_type.into(),
return_arguments: vec![ast::Variable { state_space: ast::StateSpace::Reg,
align: None, name: resolver.register_unnamed(Some((return_type.into(), ast::StateSpace::Reg))),
v_type: return_type.into(), array_init: Vec::new(),
state_space: ast::StateSpace::Reg, }];
name: resolver let input_arguments = input_type
.register_unnamed(Some((return_type.into(), ast::StateSpace::Reg))), .into_iter()
array_init: Vec::new(), .map(|type_| ast::Variable {
}], align: None,
name: name, v_type: type_.into(),
input_arguments: input_type state_space: ast::StateSpace::Reg,
.into_iter() name: resolver.register_unnamed(Some((type_.into(), ast::StateSpace::Reg))),
.map(|type_| ast::Variable { array_init: Vec::new(),
align: None, })
v_type: type_.into(), .collect::<Vec<_>>();
state_space: ast::StateSpace::Reg, fn_(sreg, (return_arguments, name, input_arguments));
name: resolver }
.register_unnamed(Some((type_.into(), ast::StateSpace::Reg))),
array_init: Vec::new(),
})
.collect::<Vec<_>>(),
shared_mem: None,
},
)
})
} }
} }

View File

@ -0,0 +1,134 @@
use super::*;
// This pass normalizes PTX modules in ways that make the mode computation pass
// and the code emission passes much simpler:
// * Inserts a label at the start of every function
//   This makes the control flow graph simpler in the mode computation pass: we can
//   represent each kernel as a separate node with its own entry/exit mode
// * Inserts a label at the start of every basic block
// * Inserts explicit jumps before labels
// * Non-.entry methods get a single `ret;` exit point - this is because the mode
//   computation logic requires it. The control flow graph constructed by mode
//   computation models function calls as jumps into and then back from another
//   function. If this cfg allowed multiple return basic blocks then there would
//   be cases where we want to insert a mode setting instruction along the edge
//   between `ret;` and a bb in the caller. That is only possible if there's a
//   single edge from the function's `ret;` to the caller
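//
// An illustrative before/after sketch of this normalization (PTX-style pseudocode
// with made-up identifiers, not taken from any real module). A body like:
//
//     add.f32 a, b, c;
//     @p bra DONE;
//     mul.f32 a, a, a;
//     DONE:
//     ret;
//
// comes out roughly as:
//
//     ENTRY_0:              // fresh label inserted at the function start
//     add.f32 a, b, c;
//     @p bra DONE;          // already a Conditional statement at this point
//     BLOCK_1:              // fresh label inserted after a terminator
//     mul.f32 a, a, a;
//     bra DONE;             // explicit jump inserted before the label
//     DONE:
//     ret;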
pub(crate) fn run(
flat_resolver: &mut GlobalStringIdentResolver2<'_>,
mut directives: Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>,
) -> Result<Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
for directive in directives.iter_mut() {
let (body_ref, is_kernel) = match directive {
Directive2::Method(Function2 {
body: Some(body), is_kernel, ..
}) => (body, *is_kernel),
_ => continue,
};
let body = std::mem::replace(body_ref, Vec::new());
let mut result = Vec::with_capacity(body.len());
let mut previous_instruction_was_terminator = TerminatorKind::Not;
let mut body_iterator = body.into_iter();
let mut return_statements = Vec::new();
match body_iterator.next() {
Some(Statement::Label(_)) => {}
Some(statement) => {
result.push(Statement::Label(flat_resolver.register_unnamed(None)));
result.push(statement);
}
None => {}
}
for statement in body_iterator {
match previous_instruction_was_terminator {
TerminatorKind::Not => match statement {
Statement::Label(label) => {
result.push(Statement::Instruction(ast::Instruction::Bra {
arguments: ast::BraArgs { src: label },
}))
}
_ => {}
},
TerminatorKind::Real => {
if !matches!(statement, Statement::Label(..)) {
result.push(Statement::Label(flat_resolver.register_unnamed(None)));
}
}
TerminatorKind::Fake => match statement {
// If there's a label after a call, just reuse it
Statement::Label(label) => {
result.push(Statement::Instruction(ast::Instruction::Bra {
arguments: ast::BraArgs { src: label },
}))
}
_ => {
let label = flat_resolver.register_unnamed(None);
result.push(Statement::Instruction(ast::Instruction::Bra {
arguments: ast::BraArgs { src: label },
}));
result.push(Statement::Label(label));
}
},
}
match statement {
Statement::RetValue(..) => {
return Err(error_unreachable());
}
Statement::Instruction(ast::Instruction::Ret { .. }) => {
if !is_kernel {
return_statements.push(result.len());
}
}
_ => {}
}
previous_instruction_was_terminator = is_block_terminator(&statement);
result.push(statement);
}
convert_from_multiple_returns_to_single_return(
flat_resolver,
&mut result,
return_statements,
)?;
*body_ref = result;
}
Ok(directives)
}
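// How a statement ends a basic block: `Real` covers actual terminators
// (conditionals, `bra` and `ret`), `Fake` covers calls, which are not real
// terminators but are treated as block boundaries so that the
// instruction-modes-to-global-modes pass can model them as jumps into and out
// of the callee, and `Not` is everything else.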
enum TerminatorKind {
Not,
Real,
Fake,
}
fn convert_from_multiple_returns_to_single_return(
flat_resolver: &mut GlobalStringIdentResolver2<'_>,
result: &mut Vec<Statement<ptx_parser::Instruction<SpirvWord>, SpirvWord>>,
return_statements: Vec<usize>,
) -> Result<(), TranslateError> {
Ok(if return_statements.len() > 1 {
let ret_bb = flat_resolver.register_unnamed(None);
result.push(Statement::Label(ret_bb));
result.push(Statement::Instruction(ast::Instruction::Ret {
data: ast::RetData { uniform: false },
}));
for ret_index in return_statements {
let statement = result.get_mut(ret_index).ok_or_else(error_unreachable)?;
*statement = Statement::Instruction(ast::Instruction::Bra {
arguments: ast::BraArgs { src: ret_bb },
});
}
})
}
fn is_block_terminator(
statement: &Statement<ast::Instruction<SpirvWord>, SpirvWord>,
) -> TerminatorKind {
match statement {
Statement::Conditional(..)
| Statement::Instruction(ast::Instruction::Bra { .. })
| Statement::Instruction(ast::Instruction::Ret { .. }) => TerminatorKind::Real,
// Normally call is not a terminator, but we treat it as such because it
// makes the "instruction modes to global modes" pass possible
Statement::Instruction(ast::Instruction::Call { .. }) => TerminatorKind::Fake,
_ => TerminatorKind::Not,
}
}

View File

@ -4,7 +4,7 @@ use ptx_parser as ast;
pub(crate) fn run<'input, 'b>( pub(crate) fn run<'input, 'b>(
resolver: &mut ScopedResolver<'input, 'b>, resolver: &mut ScopedResolver<'input, 'b>,
directives: Vec<ast::Directive<'input, ast::ParsedOperand<&'input str>>>, directives: Vec<ast::Directive<'input, ast::ParsedOperand<&'input str>>>,
) -> Result<Vec<NormalizedDirective2<'input>>, TranslateError> { ) -> Result<Vec<NormalizedDirective2>, TranslateError> {
resolver.start_scope(); resolver.start_scope();
let result = directives let result = directives
.into_iter() .into_iter()
@ -17,7 +17,7 @@ pub(crate) fn run<'input, 'b>(
fn run_directive<'input, 'b>( fn run_directive<'input, 'b>(
resolver: &mut ScopedResolver<'input, 'b>, resolver: &mut ScopedResolver<'input, 'b>,
directive: ast::Directive<'input, ast::ParsedOperand<&'input str>>, directive: ast::Directive<'input, ast::ParsedOperand<&'input str>>,
) -> Result<NormalizedDirective2<'input>, TranslateError> { ) -> Result<NormalizedDirective2, TranslateError> {
Ok(match directive { Ok(match directive {
ast::Directive::Variable(linking, var) => { ast::Directive::Variable(linking, var) => {
NormalizedDirective2::Variable(linking, run_variable(resolver, var)?) NormalizedDirective2::Variable(linking, run_variable(resolver, var)?)
@ -32,15 +32,11 @@ fn run_method<'input, 'b>(
resolver: &mut ScopedResolver<'input, 'b>, resolver: &mut ScopedResolver<'input, 'b>,
linkage: ast::LinkingDirective, linkage: ast::LinkingDirective,
method: ast::Function<'input, &'input str, ast::Statement<ast::ParsedOperand<&'input str>>>, method: ast::Function<'input, &'input str, ast::Statement<ast::ParsedOperand<&'input str>>>,
) -> Result<NormalizedFunction2<'input>, TranslateError> { ) -> Result<NormalizedFunction2, TranslateError> {
let name = match method.func_directive.name { let is_kernel = method.func_directive.name.is_kernel();
ast::MethodName::Kernel(name) => ast::MethodName::Kernel(name), let name = resolver.add_or_get_in_current_scope_untyped(method.func_directive.name.text())?;
ast::MethodName::Func(text) => {
ast::MethodName::Func(resolver.add_or_get_in_current_scope_untyped(text)?)
}
};
resolver.start_scope(); resolver.start_scope();
let func_decl = run_function_decl(resolver, method.func_directive, name)?; let (return_arguments, input_arguments) = run_function_decl(resolver, method.func_directive)?;
let body = method let body = method
.body .body
.map(|statements| { .map(|statements| {
@ -51,20 +47,25 @@ fn run_method<'input, 'b>(
.transpose()?; .transpose()?;
resolver.end_scope(); resolver.end_scope();
Ok(Function2 { Ok(Function2 {
func_decl, return_arguments,
globals: Vec::new(), name,
input_arguments,
body, body,
import_as: None, import_as: None,
tuning: method.tuning,
linkage, linkage,
is_kernel,
tuning: method.tuning,
flush_to_zero_f32: false,
flush_to_zero_f16f64: false,
rounding_mode_f32: ptx_parser::RoundingMode::NearestEven,
rounding_mode_f16f64: ptx_parser::RoundingMode::NearestEven,
}) })
} }
fn run_function_decl<'input, 'b>( fn run_function_decl<'input, 'b>(
resolver: &mut ScopedResolver<'input, 'b>, resolver: &mut ScopedResolver<'input, 'b>,
func_directive: ast::MethodDeclaration<'input, &'input str>, func_directive: ast::MethodDeclaration<'input, &'input str>,
name: ast::MethodName<'input, SpirvWord>, ) -> Result<(Vec<ast::Variable<SpirvWord>>, Vec<ast::Variable<SpirvWord>>), TranslateError> {
) -> Result<ast::MethodDeclaration<'input, SpirvWord>, TranslateError> {
assert!(func_directive.shared_mem.is_none()); assert!(func_directive.shared_mem.is_none());
let return_arguments = func_directive let return_arguments = func_directive
.return_arguments .return_arguments
@ -76,12 +77,7 @@ fn run_function_decl<'input, 'b>(
.into_iter() .into_iter()
.map(|var| run_variable(resolver, var)) .map(|var| run_variable(resolver, var))
.collect::<Result<Vec<_>, _>>()?; .collect::<Result<Vec<_>, _>>()?;
Ok(ast::MethodDeclaration { Ok((return_arguments, input_arguments))
return_arguments,
name,
input_arguments,
shared_mem: None,
})
} }
fn run_variable<'input, 'b>( fn run_variable<'input, 'b>(

View File

@ -3,8 +3,8 @@ use ptx_parser as ast;
pub(crate) fn run<'input>( pub(crate) fn run<'input>(
resolver: &mut GlobalStringIdentResolver2<'input>, resolver: &mut GlobalStringIdentResolver2<'input>,
directives: Vec<NormalizedDirective2<'input>>, directives: Vec<NormalizedDirective2>,
) -> Result<Vec<UnconditionalDirective<'input>>, TranslateError> { ) -> Result<Vec<UnconditionalDirective>, TranslateError> {
directives directives
.into_iter() .into_iter()
.map(|directive| run_directive(resolver, directive)) .map(|directive| run_directive(resolver, directive))
@ -13,8 +13,8 @@ pub(crate) fn run<'input>(
fn run_directive<'input>( fn run_directive<'input>(
resolver: &mut GlobalStringIdentResolver2<'input>, resolver: &mut GlobalStringIdentResolver2<'input>,
directive: NormalizedDirective2<'input>, directive: NormalizedDirective2,
) -> Result<UnconditionalDirective<'input>, TranslateError> { ) -> Result<UnconditionalDirective, TranslateError> {
Ok(match directive { Ok(match directive {
Directive2::Variable(linking, var) => Directive2::Variable(linking, var), Directive2::Variable(linking, var) => Directive2::Variable(linking, var),
Directive2::Method(method) => Directive2::Method(run_method(resolver, method)?), Directive2::Method(method) => Directive2::Method(run_method(resolver, method)?),
@ -23,8 +23,8 @@ fn run_directive<'input>(
fn run_method<'input>( fn run_method<'input>(
resolver: &mut GlobalStringIdentResolver2<'input>, resolver: &mut GlobalStringIdentResolver2<'input>,
method: NormalizedFunction2<'input>, method: NormalizedFunction2,
) -> Result<UnconditionalFunction<'input>, TranslateError> { ) -> Result<UnconditionalFunction, TranslateError> {
let body = method let body = method
.body .body
.map(|statements| { .map(|statements| {
@ -36,12 +36,18 @@ fn run_method<'input>(
}) })
.transpose()?; .transpose()?;
Ok(Function2 { Ok(Function2 {
func_decl: method.func_decl,
globals: method.globals,
body, body,
return_arguments: method.return_arguments,
name: method.name,
input_arguments: method.input_arguments,
import_as: method.import_as, import_as: method.import_as,
tuning: method.tuning, tuning: method.tuning,
linkage: method.linkage, linkage: method.linkage,
is_kernel: method.is_kernel,
flush_to_zero_f32: method.flush_to_zero_f32,
flush_to_zero_f16f64: method.flush_to_zero_f16f64,
rounding_mode_f32: method.rounding_mode_f32,
rounding_mode_f16f64: method.rounding_mode_f16f64,
}) })
} }

View File

@ -0,0 +1,122 @@
use super::*;
use petgraph::{
graph::NodeIndex,
visit::{Bfs, VisitMap},
Graph,
};
use rustc_hash::FxHashSet;
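// This pass removes dead code at two levels. Within each method body it builds a
// control flow graph over the labels introduced by normalize_basic_blocks, walks it
// breadth-first from the entry label, and filters out every statement belonging to
// a basic block that the walk never reaches. Across the module it then drops
// non-kernel functions that are never the target of any `call` instruction.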
pub(crate) fn run(
mut directives: Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>,
) -> Result<Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
let mut reachable_funcs = FxHashSet::default();
for directive in directives.iter_mut() {
match directive {
Directive2::Method(Function2 {
body: Some(body), ..
}) => {
let old_body = std::mem::replace(body, Vec::new());
let mut cfg = ControlFlowGraph::new();
let mut old_body_iter = old_body.iter();
let mut current_bb = match old_body_iter.next() {
Some(Statement::Label(label)) => cfg.add_or_get_node(*label),
_ => return Err(error_unreachable()),
};
let first_bb = current_bb;
for statement in old_body_iter {
match statement {
Statement::Label(label) => {
current_bb = cfg.add_or_get_node(*label);
}
Statement::Conditional(branch) => {
cfg.add_branch(current_bb, branch.if_true);
cfg.add_branch(current_bb, branch.if_false);
}
Statement::Instruction(ast::Instruction::Bra {
arguments: ast::BraArgs { src },
}) => {
cfg.add_branch(current_bb, *src);
}
Statement::FunctionPointer(FunctionPointerDetails {
src: _func, ..
}) => {
return Err(error_todo());
}
Statement::Instruction(ast::Instruction::Call {
arguments: ast::CallArgs { func, .. },
..
}) => {
reachable_funcs.insert(*func);
}
_ => {}
}
}
let mut bfs = Bfs::new(&cfg.graph, first_bb);
while let Some(_) = bfs.next(&cfg.graph) {}
let mut visited = true;
*body = try_filter_to_vec(old_body.into_iter(), |statement| {
match statement {
Statement::Label(label) => {
visited = bfs
.discovered
.is_visited(cfg.nodes.get(label).ok_or_else(error_unreachable)?);
}
_ => {}
}
Ok(visited)
})?;
}
_ => {}
}
}
Ok(directives
.into_iter()
.filter(|directive| match directive {
Directive2::Variable(..) => true,
Directive2::Method(Function2 {
name, is_kernel, ..
}) => *is_kernel || reachable_funcs.contains(name),
})
.collect::<Vec<_>>())
}
fn try_filter_to_vec<T, E>(
mut iter: impl ExactSizeIterator<Item = T>,
mut filter: impl FnMut(&T) -> Result<bool, E>,
) -> Result<Vec<T>, E> {
iter.try_fold(Vec::with_capacity(iter.len()), |mut vec, item| {
match filter(&item) {
Ok(true) => vec.push(item),
Ok(false) => {}
Err(err) => return Err(err),
}
Ok(vec)
})
}
struct ControlFlowGraph {
graph: Graph<SpirvWord, ()>,
nodes: FxHashMap<SpirvWord, NodeIndex>,
}
impl ControlFlowGraph {
fn new() -> Self {
Self {
graph: Graph::new(),
nodes: FxHashMap::default(),
}
}
fn add_or_get_node(&mut self, id: SpirvWord) -> NodeIndex {
*self
.nodes
.entry(id)
.or_insert_with(|| self.graph.add_node(id))
}
fn add_branch(&mut self, from: NodeIndex, to: SpirvWord) -> NodeIndex {
let to = self.add_or_get_node(to);
self.graph.add_edge(from, to, ());
to
}
}

View File

@ -2,8 +2,8 @@ use super::*;
pub(super) fn run<'input>( pub(super) fn run<'input>(
resolver: &mut GlobalStringIdentResolver2<'input>, resolver: &mut GlobalStringIdentResolver2<'input>,
directives: Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>, directives: Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>,
) -> Result<Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> { ) -> Result<Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
let mut fn_declarations = FxHashMap::default(); let mut fn_declarations = FxHashMap::default();
let remapped_directives = directives let remapped_directives = directives
.into_iter() .into_iter()
@ -13,17 +13,18 @@ pub(super) fn run<'input>(
.into_iter() .into_iter()
.map(|(_, (return_arguments, name, input_arguments))| { .map(|(_, (return_arguments, name, input_arguments))| {
Directive2::Method(Function2 { Directive2::Method(Function2 {
func_decl: ast::MethodDeclaration { return_arguments,
return_arguments, name: name,
name: ast::MethodName::Func(name), input_arguments,
input_arguments,
shared_mem: None,
},
globals: Vec::new(),
body: None, body: None,
import_as: None, import_as: None,
tuning: Vec::new(), tuning: Vec::new(),
linkage: ast::LinkingDirective::EXTERN, linkage: ast::LinkingDirective::EXTERN,
is_kernel: false,
flush_to_zero_f32: false,
flush_to_zero_f16f64: false,
rounding_mode_f32: ptx_parser::RoundingMode::NearestEven,
rounding_mode_f16f64: ptx_parser::RoundingMode::NearestEven,
}) })
}) })
.collect::<Vec<_>>(); .collect::<Vec<_>>();
@ -41,8 +42,8 @@ fn run_directive<'input>(
Vec<ast::Variable<SpirvWord>>, Vec<ast::Variable<SpirvWord>>,
), ),
>, >,
directive: Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>, directive: Directive2<ast::Instruction<SpirvWord>, SpirvWord>,
) -> Result<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> { ) -> Result<Directive2<ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
Ok(match directive { Ok(match directive {
var @ Directive2::Variable(..) => var, var @ Directive2::Variable(..) => var,
Directive2::Method(mut method) => { Directive2::Method(mut method) => {

View File

@ -1,14 +1,15 @@
use std::borrow::Cow;
use super::{GlobalStringIdentResolver2, NormalizedDirective2, SpirvWord}; use super::{GlobalStringIdentResolver2, NormalizedDirective2, SpirvWord};
pub(crate) fn run<'input>( pub(crate) fn run<'input>(
resolver: &GlobalStringIdentResolver2<'input>, resolver: &mut GlobalStringIdentResolver2<'input>,
mut directives: Vec<NormalizedDirective2<'input>>, mut directives: Vec<NormalizedDirective2>,
) -> Vec<NormalizedDirective2<'input>> { ) -> Vec<NormalizedDirective2> {
for directive in directives.iter_mut() { for directive in directives.iter_mut() {
match directive { match directive {
NormalizedDirective2::Method(func) => { NormalizedDirective2::Method(func) => {
func.import_as = replace_with_ptx_impl(resolver, func.name);
replace_with_ptx_impl(resolver, &func.func_decl.name, func.import_as.take());
} }
_ => {} _ => {}
} }
@ -17,22 +18,16 @@ pub(crate) fn run<'input>(
} }
fn replace_with_ptx_impl<'input>( fn replace_with_ptx_impl<'input>(
resolver: &GlobalStringIdentResolver2<'input>, resolver: &mut GlobalStringIdentResolver2<'input>,
fn_name: &ptx_parser::MethodName<'input, SpirvWord>, fn_name: SpirvWord,
name: Option<String>, ) {
) -> Option<String> {
let known_names = ["__assertfail"]; let known_names = ["__assertfail"];
match name { if let Some(super::IdentEntry {
Some(name) if known_names.contains(&&*name) => Some(format!("__zluda_ptx_impl_{}", name)), name: Some(name), ..
Some(name) => Some(name), }) = resolver.ident_map.get_mut(&fn_name)
None => match fn_name { {
ptx_parser::MethodName::Func(name) => match resolver.ident_map.get(name) { if known_names.contains(&&**name) {
Some(super::IdentEntry { *name = Cow::Owned(format!("__zluda_ptx_impl_{}", name));
name: Some(name), .. }
}) => Some(format!("__zluda_ptx_impl_{}", name)),
_ => None,
},
ptx_parser::MethodName::Kernel(..) => None,
},
} }
} }

View File

@ -3,8 +3,8 @@ use ptx_parser as ast;
use rustc_hash::FxHashSet; use rustc_hash::FxHashSet;
pub(crate) fn run<'input>( pub(crate) fn run<'input>(
directives: Vec<UnconditionalDirective<'input>>, directives: Vec<UnconditionalDirective>,
) -> Result<Vec<UnconditionalDirective<'input>>, TranslateError> { ) -> Result<Vec<UnconditionalDirective>, TranslateError> {
let mut functions = FxHashSet::default(); let mut functions = FxHashSet::default();
directives directives
.into_iter() .into_iter()
@ -14,19 +14,13 @@ pub(crate) fn run<'input>(
fn run_directive<'input>( fn run_directive<'input>(
functions: &mut FxHashSet<SpirvWord>, functions: &mut FxHashSet<SpirvWord>,
directive: UnconditionalDirective<'input>, directive: UnconditionalDirective,
) -> Result<UnconditionalDirective<'input>, TranslateError> { ) -> Result<UnconditionalDirective, TranslateError> {
Ok(match directive { Ok(match directive {
var @ Directive2::Variable(..) => var, var @ Directive2::Variable(..) => var,
Directive2::Method(method) => { Directive2::Method(method) => {
{ if !method.is_kernel {
let func_decl = &method.func_decl; functions.insert(method.name);
match func_decl.name {
ptx_parser::MethodName::Kernel(_) => {}
ptx_parser::MethodName::Func(name) => {
functions.insert(name);
}
}
} }
Directive2::Method(run_method(functions, method)?) Directive2::Method(run_method(functions, method)?)
} }
@ -35,8 +29,8 @@ fn run_directive<'input>(
fn run_method<'input>( fn run_method<'input>(
functions: &mut FxHashSet<SpirvWord>, functions: &mut FxHashSet<SpirvWord>,
method: UnconditionalFunction<'input>, method: UnconditionalFunction,
) -> Result<UnconditionalFunction<'input>, TranslateError> { ) -> Result<UnconditionalFunction, TranslateError> {
let body = method let body = method
.body .body
.map(|statements| { .map(|statements| {
@ -46,14 +40,7 @@ fn run_method<'input>(
.collect::<Result<Vec<_>, _>>() .collect::<Result<Vec<_>, _>>()
}) })
.transpose()?; .transpose()?;
Ok(Function2 { Ok(Function2 { body, ..method })
func_decl: method.func_decl,
globals: method.globals,
body,
import_as: method.import_as,
tuning: method.tuning,
linkage: method.linkage,
})
} }
fn run_statement<'input>( fn run_statement<'input>(

View File

@ -0,0 +1,24 @@
declare i32 @__zluda_ptx_impl_activemask() #0
define amdgpu_kernel void @activemask(ptr addrspace(4) byref(i64) %"29", ptr addrspace(4) byref(i64) %"30") #1 {
%"31" = alloca i64, align 8, addrspace(5)
%"32" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"28"
"28": ; preds = %1
%"33" = load i64, ptr addrspace(4) %"30", align 4
store i64 %"33", ptr addrspace(5) %"31", align 4
%"34" = call i32 @__zluda_ptx_impl_activemask()
store i32 %"34", ptr addrspace(5) %"32", align 4
%"35" = load i64, ptr addrspace(5) %"31", align 4
%"36" = load i32, ptr addrspace(5) %"32", align 4
%"37" = inttoptr i64 %"35" to ptr
store i32 %"36", ptr %"37", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

ptx/src/test/ll/add.ll
View File

@ -0,0 +1,30 @@
define amdgpu_kernel void @add(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) byref(i64) %"33") #0 {
%"34" = alloca i64, align 8, addrspace(5)
%"35" = alloca i64, align 8, addrspace(5)
%"36" = alloca i64, align 8, addrspace(5)
%"37" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
br label %"31"
"31": ; preds = %1
%"38" = load i64, ptr addrspace(4) %"32", align 4
store i64 %"38", ptr addrspace(5) %"34", align 4
%"39" = load i64, ptr addrspace(4) %"33", align 4
store i64 %"39", ptr addrspace(5) %"35", align 4
%"41" = load i64, ptr addrspace(5) %"34", align 4
%"46" = inttoptr i64 %"41" to ptr
%"40" = load i64, ptr %"46", align 4
store i64 %"40", ptr addrspace(5) %"36", align 4
%"43" = load i64, ptr addrspace(5) %"36", align 4
%"42" = add i64 %"43", 1
store i64 %"42", ptr addrspace(5) %"37", align 4
%"44" = load i64, ptr addrspace(5) %"35", align 4
%"45" = load i64, ptr addrspace(5) %"37", align 4
%"47" = inttoptr i64 %"44" to ptr
store i64 %"45", ptr %"47", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

View File

@ -0,0 +1,52 @@
define amdgpu_kernel void @add_ftz(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 {
%"39" = alloca i64, align 8, addrspace(5)
%"40" = alloca i64, align 8, addrspace(5)
%"41" = alloca float, align 4, addrspace(5)
%"42" = alloca float, align 4, addrspace(5)
%"43" = alloca float, align 4, addrspace(5)
%"44" = alloca float, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"36"
"36": ; preds = %1
%"45" = load i64, ptr addrspace(4) %"37", align 4
store i64 %"45", ptr addrspace(5) %"39", align 4
%"46" = load i64, ptr addrspace(4) %"38", align 4
store i64 %"46", ptr addrspace(5) %"40", align 4
%"48" = load i64, ptr addrspace(5) %"39", align 4
%"61" = inttoptr i64 %"48" to ptr
%"47" = load float, ptr %"61", align 4
store float %"47", ptr addrspace(5) %"41", align 4
%"49" = load i64, ptr addrspace(5) %"39", align 4
%"62" = inttoptr i64 %"49" to ptr
%"33" = getelementptr inbounds i8, ptr %"62", i64 4
%"50" = load float, ptr %"33", align 4
store float %"50", ptr addrspace(5) %"42", align 4
%"52" = load float, ptr addrspace(5) %"41", align 4
%"53" = load float, ptr addrspace(5) %"42", align 4
%"51" = fadd float %"52", %"53"
store float %"51", ptr addrspace(5) %"43", align 4
call void @llvm.amdgcn.s.setreg(i32 6401, i32 3)
%"55" = load float, ptr addrspace(5) %"41", align 4
%"56" = load float, ptr addrspace(5) %"42", align 4
%"54" = fadd float %"55", %"56"
store float %"54", ptr addrspace(5) %"44", align 4
%"57" = load i64, ptr addrspace(5) %"40", align 4
%"58" = load float, ptr addrspace(5) %"43", align 4
%"63" = inttoptr i64 %"57" to ptr
store float %"58", ptr %"63", align 4
%"59" = load i64, ptr addrspace(5) %"40", align 4
%"64" = inttoptr i64 %"59" to ptr
%"35" = getelementptr inbounds i8, ptr %"64", i64 4
%"60" = load float, ptr addrspace(5) %"44", align 4
store float %"60", ptr %"35", align 4
ret void
}
; Function Attrs: nocallback nofree nosync nounwind willreturn
declare void @llvm.amdgcn.s.setreg(i32 immarg, i32) #1
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind willreturn }

View File

@ -0,0 +1,30 @@
define amdgpu_kernel void @add_non_coherent(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) byref(i64) %"33") #0 {
%"34" = alloca i64, align 8, addrspace(5)
%"35" = alloca i64, align 8, addrspace(5)
%"36" = alloca i64, align 8, addrspace(5)
%"37" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
br label %"31"
"31": ; preds = %1
%"38" = load i64, ptr addrspace(4) %"32", align 4
store i64 %"38", ptr addrspace(5) %"34", align 4
%"39" = load i64, ptr addrspace(4) %"33", align 4
store i64 %"39", ptr addrspace(5) %"35", align 4
%"41" = load i64, ptr addrspace(5) %"34", align 4
%"46" = inttoptr i64 %"41" to ptr addrspace(1)
%"40" = load i64, ptr addrspace(1) %"46", align 4
store i64 %"40", ptr addrspace(5) %"36", align 4
%"43" = load i64, ptr addrspace(5) %"36", align 4
%"42" = add i64 %"43", 1
store i64 %"42", ptr addrspace(5) %"37", align 4
%"44" = load i64, ptr addrspace(5) %"35", align 4
%"45" = load i64, ptr addrspace(5) %"37", align 4
%"47" = inttoptr i64 %"44" to ptr addrspace(1)
store i64 %"45", ptr addrspace(1) %"47", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

View File

@ -0,0 +1,30 @@
define amdgpu_kernel void @add_tuning(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) byref(i64) %"33") #0 {
%"34" = alloca i64, align 8, addrspace(5)
%"35" = alloca i64, align 8, addrspace(5)
%"36" = alloca i64, align 8, addrspace(5)
%"37" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
br label %"31"
"31": ; preds = %1
%"38" = load i64, ptr addrspace(4) %"32", align 4
store i64 %"38", ptr addrspace(5) %"34", align 4
%"39" = load i64, ptr addrspace(4) %"33", align 4
store i64 %"39", ptr addrspace(5) %"35", align 4
%"41" = load i64, ptr addrspace(5) %"34", align 4
%"46" = inttoptr i64 %"41" to ptr
%"40" = load i64, ptr %"46", align 4
store i64 %"40", ptr addrspace(5) %"36", align 4
%"43" = load i64, ptr addrspace(5) %"36", align 4
%"42" = add i64 %"43", 1
store i64 %"42", ptr addrspace(5) %"37", align 4
%"44" = load i64, ptr addrspace(5) %"35", align 4
%"45" = load i64, ptr addrspace(5) %"37", align 4
%"47" = inttoptr i64 %"44" to ptr
store i64 %"45", ptr %"47", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

ptx/src/test/ll/and.ll
View File

@ -0,0 +1,36 @@
define amdgpu_kernel void @and(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 {
%"35" = alloca i64, align 8, addrspace(5)
%"36" = alloca i64, align 8, addrspace(5)
%"37" = alloca i32, align 4, addrspace(5)
%"38" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"32"
"32": ; preds = %1
%"39" = load i64, ptr addrspace(4) %"33", align 4
store i64 %"39", ptr addrspace(5) %"35", align 4
%"40" = load i64, ptr addrspace(4) %"34", align 4
store i64 %"40", ptr addrspace(5) %"36", align 4
%"42" = load i64, ptr addrspace(5) %"35", align 4
%"50" = inttoptr i64 %"42" to ptr
%"41" = load i32, ptr %"50", align 4
store i32 %"41", ptr addrspace(5) %"37", align 4
%"43" = load i64, ptr addrspace(5) %"35", align 4
%"51" = inttoptr i64 %"43" to ptr
%"31" = getelementptr inbounds i8, ptr %"51", i64 4
%"44" = load i32, ptr %"31", align 4
store i32 %"44", ptr addrspace(5) %"38", align 4
%"46" = load i32, ptr addrspace(5) %"37", align 4
%"47" = load i32, ptr addrspace(5) %"38", align 4
%"52" = and i32 %"46", %"47"
store i32 %"52", ptr addrspace(5) %"37", align 4
%"48" = load i64, ptr addrspace(5) %"36", align 4
%"49" = load i32, ptr addrspace(5) %"37", align 4
%"55" = inttoptr i64 %"48" to ptr
store i32 %"49", ptr %"55", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

View File

@ -0,0 +1,46 @@
@shared_mem = external addrspace(3) global [1024 x i8], align 4
define amdgpu_kernel void @atom_add(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 {
%"38" = alloca i64, align 8, addrspace(5)
%"39" = alloca i64, align 8, addrspace(5)
%"40" = alloca i32, align 4, addrspace(5)
%"41" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"35"
"35": ; preds = %1
%"42" = load i64, ptr addrspace(4) %"36", align 4
store i64 %"42", ptr addrspace(5) %"38", align 4
%"43" = load i64, ptr addrspace(4) %"37", align 4
store i64 %"43", ptr addrspace(5) %"39", align 4
%"45" = load i64, ptr addrspace(5) %"38", align 4
%"56" = inttoptr i64 %"45" to ptr
%"44" = load i32, ptr %"56", align 4
store i32 %"44", ptr addrspace(5) %"40", align 4
%"46" = load i64, ptr addrspace(5) %"38", align 4
%"57" = inttoptr i64 %"46" to ptr
%"32" = getelementptr inbounds i8, ptr %"57", i64 4
%"47" = load i32, ptr %"32", align 4
store i32 %"47", ptr addrspace(5) %"41", align 4
%"48" = load i32, ptr addrspace(5) %"40", align 4
store i32 %"48", ptr addrspace(3) @shared_mem, align 4
%"50" = load i32, ptr addrspace(5) %"41", align 4
%2 = atomicrmw add ptr addrspace(3) @shared_mem, i32 %"50" syncscope("agent-one-as") monotonic, align 4
store i32 %2, ptr addrspace(5) %"40", align 4
%"51" = load i32, ptr addrspace(3) @shared_mem, align 4
store i32 %"51", ptr addrspace(5) %"41", align 4
%"52" = load i64, ptr addrspace(5) %"39", align 4
%"53" = load i32, ptr addrspace(5) %"40", align 4
%"61" = inttoptr i64 %"52" to ptr
store i32 %"53", ptr %"61", align 4
%"54" = load i64, ptr addrspace(5) %"39", align 4
%"62" = inttoptr i64 %"54" to ptr
%"34" = getelementptr inbounds i8, ptr %"62", i64 4
%"55" = load i32, ptr addrspace(5) %"41", align 4
store i32 %"55", ptr %"34", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

View File

@ -0,0 +1,46 @@
@shared_mem = external addrspace(3) global [1024 x i8], align 4
define amdgpu_kernel void @atom_add_float(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 {
%"38" = alloca i64, align 8, addrspace(5)
%"39" = alloca i64, align 8, addrspace(5)
%"40" = alloca float, align 4, addrspace(5)
%"41" = alloca float, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"35"
"35": ; preds = %1
%"42" = load i64, ptr addrspace(4) %"36", align 4
store i64 %"42", ptr addrspace(5) %"38", align 4
%"43" = load i64, ptr addrspace(4) %"37", align 4
store i64 %"43", ptr addrspace(5) %"39", align 4
%"45" = load i64, ptr addrspace(5) %"38", align 4
%"56" = inttoptr i64 %"45" to ptr
%"44" = load float, ptr %"56", align 4
store float %"44", ptr addrspace(5) %"40", align 4
%"46" = load i64, ptr addrspace(5) %"38", align 4
%"57" = inttoptr i64 %"46" to ptr
%"32" = getelementptr inbounds i8, ptr %"57", i64 4
%"47" = load float, ptr %"32", align 4
store float %"47", ptr addrspace(5) %"41", align 4
%"48" = load float, ptr addrspace(5) %"40", align 4
store float %"48", ptr addrspace(3) @shared_mem, align 4
%"50" = load float, ptr addrspace(5) %"41", align 4
%2 = atomicrmw fadd ptr addrspace(3) @shared_mem, float %"50" syncscope("agent-one-as") monotonic, align 4
store float %2, ptr addrspace(5) %"40", align 4
%"51" = load float, ptr addrspace(3) @shared_mem, align 4
store float %"51", ptr addrspace(5) %"41", align 4
%"52" = load i64, ptr addrspace(5) %"39", align 4
%"53" = load float, ptr addrspace(5) %"40", align 4
%"61" = inttoptr i64 %"52" to ptr
store float %"53", ptr %"61", align 4
%"54" = load i64, ptr addrspace(5) %"39", align 4
%"62" = inttoptr i64 %"54" to ptr
%"34" = getelementptr inbounds i8, ptr %"62", i64 4
%"55" = load float, ptr addrspace(5) %"41", align 4
store float %"55", ptr %"34", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

View File

@ -0,0 +1,44 @@
define amdgpu_kernel void @atom_cas(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #0 {
%"40" = alloca i64, align 8, addrspace(5)
%"41" = alloca i64, align 8, addrspace(5)
%"42" = alloca i32, align 4, addrspace(5)
%"43" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"37"
"37": ; preds = %1
%"44" = load i64, ptr addrspace(4) %"38", align 4
store i64 %"44", ptr addrspace(5) %"40", align 4
%"45" = load i64, ptr addrspace(4) %"39", align 4
store i64 %"45", ptr addrspace(5) %"41", align 4
%"47" = load i64, ptr addrspace(5) %"40", align 4
%"57" = inttoptr i64 %"47" to ptr
%"46" = load i32, ptr %"57", align 4
store i32 %"46", ptr addrspace(5) %"42", align 4
%"48" = load i64, ptr addrspace(5) %"40", align 4
%"58" = inttoptr i64 %"48" to ptr
%"31" = getelementptr inbounds i8, ptr %"58", i64 4
%"50" = load i32, ptr addrspace(5) %"42", align 4
%2 = cmpxchg ptr %"31", i32 %"50", i32 100 syncscope("agent-one-as") monotonic monotonic, align 4
%"59" = extractvalue { i32, i1 } %2, 0
store i32 %"59", ptr addrspace(5) %"42", align 4
%"51" = load i64, ptr addrspace(5) %"40", align 4
%"61" = inttoptr i64 %"51" to ptr
%"34" = getelementptr inbounds i8, ptr %"61", i64 4
%"52" = load i32, ptr %"34", align 4
store i32 %"52", ptr addrspace(5) %"43", align 4
%"53" = load i64, ptr addrspace(5) %"41", align 4
%"54" = load i32, ptr addrspace(5) %"42", align 4
%"62" = inttoptr i64 %"53" to ptr
store i32 %"54", ptr %"62", align 4
%"55" = load i64, ptr addrspace(5) %"41", align 4
%"63" = inttoptr i64 %"55" to ptr
%"36" = getelementptr inbounds i8, ptr %"63", i64 4
%"56" = load i32, ptr addrspace(5) %"43", align 4
store i32 %"56", ptr %"36", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

View File

@ -0,0 +1,46 @@
define amdgpu_kernel void @atom_inc(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #0 {
%"40" = alloca i64, align 8, addrspace(5)
%"41" = alloca i64, align 8, addrspace(5)
%"42" = alloca i32, align 4, addrspace(5)
%"43" = alloca i32, align 4, addrspace(5)
%"44" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"37"
"37": ; preds = %1
%"45" = load i64, ptr addrspace(4) %"38", align 4
store i64 %"45", ptr addrspace(5) %"40", align 4
%"46" = load i64, ptr addrspace(4) %"39", align 4
store i64 %"46", ptr addrspace(5) %"41", align 4
%"48" = load i64, ptr addrspace(5) %"40", align 4
%"59" = inttoptr i64 %"48" to ptr
%2 = atomicrmw uinc_wrap ptr %"59", i32 101 syncscope("agent-one-as") monotonic, align 4
store i32 %2, ptr addrspace(5) %"42", align 4
%"50" = load i64, ptr addrspace(5) %"40", align 4
%"60" = inttoptr i64 %"50" to ptr addrspace(1)
%3 = atomicrmw uinc_wrap ptr addrspace(1) %"60", i32 101 syncscope("agent-one-as") monotonic, align 4
store i32 %3, ptr addrspace(5) %"43", align 4
%"52" = load i64, ptr addrspace(5) %"40", align 4
%"61" = inttoptr i64 %"52" to ptr
%"51" = load i32, ptr %"61", align 4
store i32 %"51", ptr addrspace(5) %"44", align 4
%"53" = load i64, ptr addrspace(5) %"41", align 4
%"54" = load i32, ptr addrspace(5) %"42", align 4
%"62" = inttoptr i64 %"53" to ptr
store i32 %"54", ptr %"62", align 4
%"55" = load i64, ptr addrspace(5) %"41", align 4
%"63" = inttoptr i64 %"55" to ptr
%"34" = getelementptr inbounds i8, ptr %"63", i64 4
%"56" = load i32, ptr addrspace(5) %"43", align 4
store i32 %"56", ptr %"34", align 4
%"57" = load i64, ptr addrspace(5) %"41", align 4
%"64" = inttoptr i64 %"57" to ptr
%"36" = getelementptr inbounds i8, ptr %"64", i64 8
%"58" = load i32, ptr addrspace(5) %"44", align 4
store i32 %"58", ptr %"36", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }


@ -0,0 +1,30 @@
define amdgpu_kernel void @b64tof64(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
%"33" = alloca double, align 8, addrspace(5)
%"34" = alloca i64, align 8, addrspace(5)
%"35" = alloca i64, align 8, addrspace(5)
%"36" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
br label %"30"
"30": ; preds = %1
%"37" = load double, ptr addrspace(4) %"31", align 8
store double %"37", ptr addrspace(5) %"33", align 8
%"38" = load i64, ptr addrspace(4) %"32", align 4
store i64 %"38", ptr addrspace(5) %"35", align 4
%"40" = load double, ptr addrspace(5) %"33", align 8
%"46" = bitcast double %"40" to i64
store i64 %"46", ptr addrspace(5) %"34", align 4
%"42" = load i64, ptr addrspace(5) %"34", align 4
%"47" = inttoptr i64 %"42" to ptr
%"41" = load i64, ptr %"47", align 4
store i64 %"41", ptr addrspace(5) %"36", align 4
%"43" = load i64, ptr addrspace(5) %"35", align 4
%"44" = load i64, ptr addrspace(5) %"36", align 4
%"48" = inttoptr i64 %"43" to ptr
store i64 %"44", ptr %"48", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

91 ptx/src/test/ll/bench.ll Normal file

@ -0,0 +1,91 @@
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
declare i32 @__zluda_ptx_impl_sreg_clock() #0
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
define amdgpu_kernel void @bench(ptr addrspace(4) byref(i64) %"55", ptr addrspace(4) byref(i64) %"56") #1 {
%"57" = alloca i64, align 8, addrspace(5)
%"58" = alloca i64, align 8, addrspace(5)
%"59" = alloca float, align 4, addrspace(5)
%"60" = alloca float, align 4, addrspace(5)
%"61" = alloca float, align 4, addrspace(5)
%"62" = alloca float, align 4, addrspace(5)
%"63" = alloca i32, align 4, addrspace(5)
%"64" = alloca i1, align 1, addrspace(5)
br label %1
1: ; preds = %0
br label %"97"
"97": ; preds = %1
%"65" = load i64, ptr addrspace(4) %"55", align 4
store i64 %"65", ptr addrspace(5) %"57", align 4
%"66" = load i64, ptr addrspace(4) %"56", align 4
store i64 %"66", ptr addrspace(5) %"58", align 4
%"68" = load i64, ptr addrspace(5) %"57", align 4
%"91" = inttoptr i64 %"68" to ptr
%"67" = load float, ptr %"91", align 4
store float %"67", ptr addrspace(5) %"59", align 4
%"69" = load i64, ptr addrspace(5) %"57", align 4
%"92" = inttoptr i64 %"69" to ptr
%"39" = getelementptr inbounds i8, ptr %"92", i64 4
%"70" = load float, ptr %"39", align 4
store float %"70", ptr addrspace(5) %"60", align 4
%"71" = load i64, ptr addrspace(5) %"57", align 4
%"93" = inttoptr i64 %"71" to ptr
%"41" = getelementptr inbounds i8, ptr %"93", i64 8
%"72" = load float, ptr %"41", align 4
store float %"72", ptr addrspace(5) %"61", align 4
%"73" = load i64, ptr addrspace(5) %"57", align 4
%"94" = inttoptr i64 %"73" to ptr
%"43" = getelementptr inbounds i8, ptr %"94", i64 12
%"74" = load float, ptr %"43", align 4
store float %"74", ptr addrspace(5) %"62", align 4
store i32 0, ptr addrspace(5) %"63", align 4
br label %"10"
"10": ; preds = %"21", %"97"
%"77" = load float, ptr addrspace(5) %"59", align 4
%"78" = load float, ptr addrspace(5) %"60", align 4
call void asm sideeffect "s_denorm_mode 0", "~{mode}"()
%"76" = fmul float %"77", %"78"
store float %"76", ptr addrspace(5) %"59", align 4
%"80" = load float, ptr addrspace(5) %"61", align 4
%"81" = load float, ptr addrspace(5) %"62", align 4
call void asm sideeffect "s_denorm_mode 11", "~{mode}"()
%"79" = fmul float %"80", %"81"
store float %"79", ptr addrspace(5) %"61", align 4
%"83" = load i32, ptr addrspace(5) %"63", align 4
%"82" = add i32 %"83", 1
store i32 %"82", ptr addrspace(5) %"63", align 4
%"85" = load i32, ptr addrspace(5) %"63", align 4
%"84" = icmp eq i32 %"85", 100000000
store i1 %"84", ptr addrspace(5) %"64", align 1
%"86" = load i1, ptr addrspace(5) %"64", align 1
br i1 %"86", label %"11", label %"21"
"21": ; preds = %"10"
br label %"10"
"11": ; preds = %"10"
%"87" = load i64, ptr addrspace(5) %"58", align 4
%"88" = load float, ptr addrspace(5) %"59", align 4
%"95" = inttoptr i64 %"87" to ptr
store float %"88", ptr %"95", align 4
%"89" = load i64, ptr addrspace(5) %"58", align 4
%"96" = inttoptr i64 %"89" to ptr
%"48" = getelementptr inbounds i8, ptr %"96", i64 4
%"90" = load float, ptr addrspace(5) %"61", align 4
store float %"90", ptr %"48", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

46 ptx/src/test/ll/bfe.ll Normal file

@ -0,0 +1,46 @@
declare i32 @__zluda_ptx_impl_bfe_u32(i32, i32, i32) #0
define amdgpu_kernel void @bfe(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #1 {
%"38" = alloca i64, align 8, addrspace(5)
%"39" = alloca i64, align 8, addrspace(5)
%"40" = alloca i32, align 4, addrspace(5)
%"41" = alloca i32, align 4, addrspace(5)
%"42" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"35"
"35": ; preds = %1
%"43" = load i64, ptr addrspace(4) %"36", align 4
store i64 %"43", ptr addrspace(5) %"38", align 4
%"44" = load i64, ptr addrspace(4) %"37", align 4
store i64 %"44", ptr addrspace(5) %"39", align 4
%"46" = load i64, ptr addrspace(5) %"38", align 4
%"57" = inttoptr i64 %"46" to ptr
%"45" = load i32, ptr %"57", align 4
store i32 %"45", ptr addrspace(5) %"40", align 4
%"47" = load i64, ptr addrspace(5) %"38", align 4
%"58" = inttoptr i64 %"47" to ptr
%"32" = getelementptr inbounds i8, ptr %"58", i64 4
%"48" = load i32, ptr %"32", align 4
store i32 %"48", ptr addrspace(5) %"41", align 4
%"49" = load i64, ptr addrspace(5) %"38", align 4
%"59" = inttoptr i64 %"49" to ptr
%"34" = getelementptr inbounds i8, ptr %"59", i64 8
%"50" = load i32, ptr %"34", align 4
store i32 %"50", ptr addrspace(5) %"42", align 4
%"52" = load i32, ptr addrspace(5) %"40", align 4
%"53" = load i32, ptr addrspace(5) %"41", align 4
%"54" = load i32, ptr addrspace(5) %"42", align 4
%"51" = call i32 @__zluda_ptx_impl_bfe_u32(i32 %"52", i32 %"53", i32 %"54")
store i32 %"51", ptr addrspace(5) %"40", align 4
%"55" = load i64, ptr addrspace(5) %"39", align 4
%"56" = load i32, ptr addrspace(5) %"40", align 4
%"60" = inttoptr i64 %"55" to ptr
store i32 %"56", ptr %"60", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

53 ptx/src/test/ll/bfi.ll Normal file

@ -0,0 +1,53 @@
declare i32 @__zluda_ptx_impl_bfi_b32(i32, i32, i32, i32) #0
define amdgpu_kernel void @bfi(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #1 {
%"41" = alloca i64, align 8, addrspace(5)
%"42" = alloca i64, align 8, addrspace(5)
%"43" = alloca i32, align 4, addrspace(5)
%"44" = alloca i32, align 4, addrspace(5)
%"45" = alloca i32, align 4, addrspace(5)
%"46" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"38"
"38": ; preds = %1
%"47" = load i64, ptr addrspace(4) %"39", align 4
store i64 %"47", ptr addrspace(5) %"41", align 4
%"48" = load i64, ptr addrspace(4) %"40", align 4
store i64 %"48", ptr addrspace(5) %"42", align 4
%"50" = load i64, ptr addrspace(5) %"41", align 4
%"64" = inttoptr i64 %"50" to ptr
%"49" = load i32, ptr %"64", align 4
store i32 %"49", ptr addrspace(5) %"43", align 4
%"51" = load i64, ptr addrspace(5) %"41", align 4
%"65" = inttoptr i64 %"51" to ptr
%"33" = getelementptr inbounds i8, ptr %"65", i64 4
%"52" = load i32, ptr %"33", align 4
store i32 %"52", ptr addrspace(5) %"44", align 4
%"53" = load i64, ptr addrspace(5) %"41", align 4
%"66" = inttoptr i64 %"53" to ptr
%"35" = getelementptr inbounds i8, ptr %"66", i64 8
%"54" = load i32, ptr %"35", align 4
store i32 %"54", ptr addrspace(5) %"45", align 4
%"55" = load i64, ptr addrspace(5) %"41", align 4
%"67" = inttoptr i64 %"55" to ptr
%"37" = getelementptr inbounds i8, ptr %"67", i64 12
%"56" = load i32, ptr %"37", align 4
store i32 %"56", ptr addrspace(5) %"46", align 4
%"58" = load i32, ptr addrspace(5) %"43", align 4
%"59" = load i32, ptr addrspace(5) %"44", align 4
%"60" = load i32, ptr addrspace(5) %"45", align 4
%"61" = load i32, ptr addrspace(5) %"46", align 4
%"68" = call i32 @__zluda_ptx_impl_bfi_b32(i32 %"58", i32 %"59", i32 %"60", i32 %"61")
store i32 %"68", ptr addrspace(5) %"43", align 4
%"62" = load i64, ptr addrspace(5) %"42", align 4
%"63" = load i32, ptr addrspace(5) %"43", align 4
%"71" = inttoptr i64 %"62" to ptr
store i32 %"63", ptr %"71", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

34 ptx/src/test/ll/block.ll Normal file

@ -0,0 +1,34 @@
define amdgpu_kernel void @block(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 {
%"36" = alloca i64, align 8, addrspace(5)
%"37" = alloca i64, align 8, addrspace(5)
%"38" = alloca i64, align 8, addrspace(5)
%"39" = alloca i64, align 8, addrspace(5)
%"46" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
br label %"33"
"33": ; preds = %1
%"40" = load i64, ptr addrspace(4) %"34", align 4
store i64 %"40", ptr addrspace(5) %"36", align 4
%"41" = load i64, ptr addrspace(4) %"35", align 4
store i64 %"41", ptr addrspace(5) %"37", align 4
%"43" = load i64, ptr addrspace(5) %"36", align 4
%"51" = inttoptr i64 %"43" to ptr
%"42" = load i64, ptr %"51", align 4
store i64 %"42", ptr addrspace(5) %"38", align 4
%"45" = load i64, ptr addrspace(5) %"38", align 4
%"44" = add i64 %"45", 1
store i64 %"44", ptr addrspace(5) %"39", align 4
%"48" = load i64, ptr addrspace(5) %"46", align 4
%"47" = add i64 %"48", 1
store i64 %"47", ptr addrspace(5) %"46", align 4
%"49" = load i64, ptr addrspace(5) %"37", align 4
%"50" = load i64, ptr addrspace(5) %"39", align 4
%"52" = inttoptr i64 %"49" to ptr
store i64 %"50", ptr %"52", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

36 ptx/src/test/ll/bra.ll Normal file

@ -0,0 +1,36 @@
define amdgpu_kernel void @bra(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 {
%"38" = alloca i64, align 8, addrspace(5)
%"39" = alloca i64, align 8, addrspace(5)
%"40" = alloca i64, align 8, addrspace(5)
%"41" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
br label %"35"
"35": ; preds = %1
%"42" = load i64, ptr addrspace(4) %"36", align 4
store i64 %"42", ptr addrspace(5) %"38", align 4
%"43" = load i64, ptr addrspace(4) %"37", align 4
store i64 %"43", ptr addrspace(5) %"39", align 4
%"45" = load i64, ptr addrspace(5) %"38", align 4
%"50" = inttoptr i64 %"45" to ptr
%"44" = load i64, ptr %"50", align 4
store i64 %"44", ptr addrspace(5) %"40", align 4
br label %"10"
"10": ; preds = %"35"
%"47" = load i64, ptr addrspace(5) %"40", align 4
%"46" = add i64 %"47", 1
store i64 %"46", ptr addrspace(5) %"41", align 4
br label %"12"
"12": ; preds = %"10"
%"48" = load i64, ptr addrspace(5) %"39", align 4
%"49" = load i64, ptr addrspace(5) %"41", align 4
%"51" = inttoptr i64 %"48" to ptr
store i64 %"49", ptr %"51", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

33 ptx/src/test/ll/brev.ll Normal file

@ -0,0 +1,33 @@
define amdgpu_kernel void @brev(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
%"32" = alloca i64, align 8, addrspace(5)
%"33" = alloca i64, align 8, addrspace(5)
%"34" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"29"
"29": ; preds = %1
%"35" = load i64, ptr addrspace(4) %"30", align 4
store i64 %"35", ptr addrspace(5) %"32", align 4
%"36" = load i64, ptr addrspace(4) %"31", align 4
store i64 %"36", ptr addrspace(5) %"33", align 4
%"38" = load i64, ptr addrspace(5) %"32", align 4
%"43" = inttoptr i64 %"38" to ptr
%"37" = load i32, ptr %"43", align 4
store i32 %"37", ptr addrspace(5) %"34", align 4
%"40" = load i32, ptr addrspace(5) %"34", align 4
%"39" = call i32 @llvm.bitreverse.i32(i32 %"40")
store i32 %"39", ptr addrspace(5) %"34", align 4
%"41" = load i64, ptr addrspace(5) %"33", align 4
%"42" = load i32, ptr addrspace(5) %"34", align 4
%"44" = inttoptr i64 %"41" to ptr
store i32 %"42", ptr %"44", align 4
ret void
}
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.bitreverse.i32(i32) #1
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }

64 ptx/src/test/ll/call.ll Normal file

@ -0,0 +1,64 @@
define i64 @incr(i64 %"43") #0 {
%"63" = alloca i64, align 8, addrspace(5)
%"64" = alloca i64, align 8, addrspace(5)
%"65" = alloca i64, align 8, addrspace(5)
%"66" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
br label %"46"
"46": ; preds = %1
store i64 %"43", ptr addrspace(5) %"65", align 4
%"67" = load i64, ptr addrspace(5) %"65", align 4
store i64 %"67", ptr addrspace(5) %"66", align 4
%"69" = load i64, ptr addrspace(5) %"66", align 4
%"68" = add i64 %"69", 1
store i64 %"68", ptr addrspace(5) %"66", align 4
%"70" = load i64, ptr addrspace(5) %"66", align 4
store i64 %"70", ptr addrspace(5) %"64", align 4
%"71" = load i64, ptr addrspace(5) %"64", align 4
store i64 %"71", ptr addrspace(5) %"63", align 4
%2 = load i64, ptr addrspace(5) %"63", align 4
ret i64 %2
}
define amdgpu_kernel void @call(ptr addrspace(4) byref(i64) %"48", ptr addrspace(4) byref(i64) %"49") #1 {
%"50" = alloca i64, align 8, addrspace(5)
%"51" = alloca i64, align 8, addrspace(5)
%"52" = alloca i64, align 8, addrspace(5)
%"57" = alloca i64, align 8, addrspace(5)
%"58" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
br label %"44"
"44": ; preds = %1
%"53" = load i64, ptr addrspace(4) %"48", align 4
store i64 %"53", ptr addrspace(5) %"50", align 4
%"54" = load i64, ptr addrspace(4) %"49", align 4
store i64 %"54", ptr addrspace(5) %"51", align 4
%"56" = load i64, ptr addrspace(5) %"50", align 4
%"72" = inttoptr i64 %"56" to ptr addrspace(1)
%"55" = load i64, ptr addrspace(1) %"72", align 4
store i64 %"55", ptr addrspace(5) %"52", align 4
%"59" = load i64, ptr addrspace(5) %"52", align 4
store i64 %"59", ptr addrspace(5) %"57", align 4
%"40" = load i64, ptr addrspace(5) %"57", align 4
%"41" = call i64 @incr(i64 %"40")
br label %"45"
"45": ; preds = %"44"
store i64 %"41", ptr addrspace(5) %"58", align 4
%"60" = load i64, ptr addrspace(5) %"58", align 4
store i64 %"60", ptr addrspace(5) %"52", align 4
%"61" = load i64, ptr addrspace(5) %"51", align 4
%"62" = load i64, ptr addrspace(5) %"52", align 4
%"75" = inttoptr i64 %"61" to ptr addrspace(1)
store i64 %"62", ptr addrspace(1) %"75", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

155 ptx/src/test/ll/call_rnd.ll Normal file

@ -0,0 +1,155 @@
define float @add_rm(float %"79", float %"80") #0 {
%"128" = alloca float, align 4, addrspace(5)
%"129" = alloca float, align 4, addrspace(5)
%"130" = alloca float, align 4, addrspace(5)
%"131" = alloca float, align 4, addrspace(5)
%"132" = alloca float, align 4, addrspace(5)
%"133" = alloca float, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"89"
"89": ; preds = %1
call void @llvm.amdgcn.s.setreg(i32 6145, i32 2)
br label %"87"
"87": ; preds = %"89"
store float %"79", ptr addrspace(5) %"130", align 4
store float %"80", ptr addrspace(5) %"131", align 4
%"134" = load float, ptr addrspace(5) %"130", align 4
store float %"134", ptr addrspace(5) %"132", align 4
%"135" = load float, ptr addrspace(5) %"131", align 4
store float %"135", ptr addrspace(5) %"133", align 4
%"137" = load float, ptr addrspace(5) %"132", align 4
%"138" = load float, ptr addrspace(5) %"133", align 4
%"136" = fadd float %"137", %"138"
store float %"136", ptr addrspace(5) %"132", align 4
%"139" = load float, ptr addrspace(5) %"132", align 4
store float %"139", ptr addrspace(5) %"129", align 4
%"140" = load float, ptr addrspace(5) %"129", align 4
store float %"140", ptr addrspace(5) %"128", align 4
%2 = load float, ptr addrspace(5) %"128", align 4
ret float %2
}
define float @add_rp(float %"82", float %"83") #0 {
%"141" = alloca float, align 4, addrspace(5)
%"142" = alloca float, align 4, addrspace(5)
%"143" = alloca float, align 4, addrspace(5)
%"144" = alloca float, align 4, addrspace(5)
%"145" = alloca float, align 4, addrspace(5)
%"146" = alloca float, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"88"
"88": ; preds = %1
store float %"82", ptr addrspace(5) %"143", align 4
store float %"83", ptr addrspace(5) %"144", align 4
%"147" = load float, ptr addrspace(5) %"143", align 4
store float %"147", ptr addrspace(5) %"145", align 4
%"148" = load float, ptr addrspace(5) %"144", align 4
store float %"148", ptr addrspace(5) %"146", align 4
%"150" = load float, ptr addrspace(5) %"145", align 4
%"151" = load float, ptr addrspace(5) %"146", align 4
%"149" = fadd float %"150", %"151"
store float %"149", ptr addrspace(5) %"145", align 4
%"152" = load float, ptr addrspace(5) %"145", align 4
store float %"152", ptr addrspace(5) %"142", align 4
%"153" = load float, ptr addrspace(5) %"142", align 4
store float %"153", ptr addrspace(5) %"141", align 4
%2 = load float, ptr addrspace(5) %"141", align 4
ret float %2
}
define amdgpu_kernel void @call_rnd(ptr addrspace(4) byref(i64) %"92", ptr addrspace(4) byref(i64) %"93") #1 {
%"94" = alloca i64, align 8, addrspace(5)
%"95" = alloca i64, align 8, addrspace(5)
%"96" = alloca float, align 4, addrspace(5)
%"97" = alloca float, align 4, addrspace(5)
%"98" = alloca float, align 4, addrspace(5)
%"99" = alloca float, align 4, addrspace(5)
%"100" = alloca float, align 4, addrspace(5)
%"101" = alloca float, align 4, addrspace(5)
%"102" = alloca float, align 4, addrspace(5)
%"103" = alloca float, align 4, addrspace(5)
%"104" = alloca float, align 4, addrspace(5)
%"105" = alloca float, align 4, addrspace(5)
%"106" = alloca float, align 4, addrspace(5)
%"107" = alloca float, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"84"
"84": ; preds = %1
call void @llvm.amdgcn.s.setreg(i32 6145, i32 1)
%"108" = load i64, ptr addrspace(4) %"92", align 4
store i64 %"108", ptr addrspace(5) %"94", align 4
%"109" = load i64, ptr addrspace(4) %"93", align 4
store i64 %"109", ptr addrspace(5) %"95", align 4
%"111" = load i64, ptr addrspace(5) %"94", align 4
%"154" = inttoptr i64 %"111" to ptr
%"110" = load float, ptr %"154", align 4
store float %"110", ptr addrspace(5) %"96", align 4
%"112" = load i64, ptr addrspace(5) %"94", align 4
%"155" = inttoptr i64 %"112" to ptr
%"59" = getelementptr inbounds i8, ptr %"155", i64 4
%"113" = load float, ptr %"59", align 4
store float %"113", ptr addrspace(5) %"97", align 4
%"114" = load i64, ptr addrspace(5) %"94", align 4
%"156" = inttoptr i64 %"114" to ptr
%"61" = getelementptr inbounds i8, ptr %"156", i64 8
%"115" = load float, ptr %"61", align 4
store float %"115", ptr addrspace(5) %"98", align 4
%"116" = load i64, ptr addrspace(5) %"94", align 4
%"157" = inttoptr i64 %"116" to ptr
%"63" = getelementptr inbounds i8, ptr %"157", i64 12
%"117" = load float, ptr %"63", align 4
store float %"117", ptr addrspace(5) %"99", align 4
%"118" = load float, ptr addrspace(5) %"96", align 4
store float %"118", ptr addrspace(5) %"102", align 4
%"119" = load float, ptr addrspace(5) %"97", align 4
store float %"119", ptr addrspace(5) %"103", align 4
%"72" = load float, ptr addrspace(5) %"102", align 4
%"73" = load float, ptr addrspace(5) %"103", align 4
%"74" = call float @add_rp(float %"72", float %"73")
br label %"85"
"85": ; preds = %"84"
store float %"74", ptr addrspace(5) %"104", align 4
%"120" = load float, ptr addrspace(5) %"104", align 4
store float %"120", ptr addrspace(5) %"100", align 4
%"121" = load i64, ptr addrspace(5) %"95", align 4
%"122" = load float, ptr addrspace(5) %"100", align 4
%"158" = inttoptr i64 %"121" to ptr
store float %"122", ptr %"158", align 4
%"123" = load float, ptr addrspace(5) %"98", align 4
store float %"123", ptr addrspace(5) %"105", align 4
%"124" = load float, ptr addrspace(5) %"99", align 4
store float %"124", ptr addrspace(5) %"106", align 4
%"75" = load float, ptr addrspace(5) %"105", align 4
%"76" = load float, ptr addrspace(5) %"106", align 4
%"77" = call float @add_rm(float %"75", float %"76")
br label %"86"
"86": ; preds = %"85"
store float %"77", ptr addrspace(5) %"107", align 4
%"125" = load float, ptr addrspace(5) %"107", align 4
store float %"125", ptr addrspace(5) %"101", align 4
%"126" = load i64, ptr addrspace(5) %"95", align 4
%"159" = inttoptr i64 %"126" to ptr
%"65" = getelementptr inbounds i8, ptr %"159", i64 4
%"127" = load float, ptr addrspace(5) %"101", align 4
store float %"127", ptr %"65", align 4
ret void
}
; Function Attrs: nocallback nofree nosync nounwind willreturn
declare void @llvm.amdgcn.s.setreg(i32 immarg, i32) #2
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #2 = { nocallback nofree nosync nounwind willreturn }

33 ptx/src/test/ll/clz.ll Normal file

@ -0,0 +1,33 @@
define amdgpu_kernel void @clz(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
%"32" = alloca i64, align 8, addrspace(5)
%"33" = alloca i64, align 8, addrspace(5)
%"34" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"29"
"29": ; preds = %1
%"35" = load i64, ptr addrspace(4) %"30", align 4
store i64 %"35", ptr addrspace(5) %"32", align 4
%"36" = load i64, ptr addrspace(4) %"31", align 4
store i64 %"36", ptr addrspace(5) %"33", align 4
%"38" = load i64, ptr addrspace(5) %"32", align 4
%"43" = inttoptr i64 %"38" to ptr
%"37" = load i32, ptr %"43", align 4
store i32 %"37", ptr addrspace(5) %"34", align 4
%"40" = load i32, ptr addrspace(5) %"34", align 4
%"44" = call i32 @llvm.ctlz.i32(i32 %"40", i1 false)
store i32 %"44", ptr addrspace(5) %"34", align 4
%"41" = load i64, ptr addrspace(5) %"33", align 4
%"42" = load i32, ptr addrspace(5) %"34", align 4
%"45" = inttoptr i64 %"41" to ptr
store i32 %"42", ptr %"45", align 4
ret void
}
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.ctlz.i32(i32, i1 immarg) #1
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }

50 ptx/src/test/ll/const.ll Normal file

@ -0,0 +1,50 @@
@constparams = addrspace(4) global [4 x i16] [i16 10, i16 20, i16 30, i16 40], align 8
define amdgpu_kernel void @const(ptr addrspace(4) byref(i64) %"46", ptr addrspace(4) byref(i64) %"47") #0 {
%"48" = alloca i64, align 8, addrspace(5)
%"49" = alloca i64, align 8, addrspace(5)
%"50" = alloca i16, align 2, addrspace(5)
%"51" = alloca i16, align 2, addrspace(5)
%"52" = alloca i16, align 2, addrspace(5)
%"53" = alloca i16, align 2, addrspace(5)
br label %1
1: ; preds = %0
br label %"45"
"45": ; preds = %1
%"54" = load i64, ptr addrspace(4) %"46", align 4
store i64 %"54", ptr addrspace(5) %"48", align 4
%"55" = load i64, ptr addrspace(4) %"47", align 4
store i64 %"55", ptr addrspace(5) %"49", align 4
%"56" = load i16, ptr addrspace(4) @constparams, align 2
store i16 %"56", ptr addrspace(5) %"50", align 2
%"57" = load i16, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) @constparams, i64 2), align 2
store i16 %"57", ptr addrspace(5) %"51", align 2
%"58" = load i16, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) @constparams, i64 4), align 2
store i16 %"58", ptr addrspace(5) %"52", align 2
%"59" = load i16, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) @constparams, i64 6), align 2
store i16 %"59", ptr addrspace(5) %"53", align 2
%"60" = load i64, ptr addrspace(5) %"49", align 4
%"61" = load i16, ptr addrspace(5) %"50", align 2
%"72" = inttoptr i64 %"60" to ptr
store i16 %"61", ptr %"72", align 2
%"62" = load i64, ptr addrspace(5) %"49", align 4
%"74" = inttoptr i64 %"62" to ptr
%"40" = getelementptr inbounds i8, ptr %"74", i64 2
%"63" = load i16, ptr addrspace(5) %"51", align 2
store i16 %"63", ptr %"40", align 2
%"64" = load i64, ptr addrspace(5) %"49", align 4
%"76" = inttoptr i64 %"64" to ptr
%"42" = getelementptr inbounds i8, ptr %"76", i64 4
%"65" = load i16, ptr addrspace(5) %"52", align 2
store i16 %"65", ptr %"42", align 2
%"66" = load i64, ptr addrspace(5) %"49", align 4
%"78" = inttoptr i64 %"66" to ptr
%"44" = getelementptr inbounds i8, ptr %"78", i64 6
%"67" = load i16, ptr addrspace(5) %"53", align 2
store i16 %"67", ptr %"44", align 2
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }


@ -0,0 +1,29 @@
define amdgpu_kernel void @constant_f32(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
%"33" = alloca i64, align 8, addrspace(5)
%"34" = alloca i64, align 8, addrspace(5)
%"35" = alloca float, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"30"
"30": ; preds = %1
%"36" = load i64, ptr addrspace(4) %"31", align 4
store i64 %"36", ptr addrspace(5) %"33", align 4
%"37" = load i64, ptr addrspace(4) %"32", align 4
store i64 %"37", ptr addrspace(5) %"34", align 4
%"39" = load i64, ptr addrspace(5) %"33", align 4
%"44" = inttoptr i64 %"39" to ptr
%"38" = load float, ptr %"44", align 4
store float %"38", ptr addrspace(5) %"35", align 4
%"41" = load float, ptr addrspace(5) %"35", align 4
%"40" = fmul float %"41", 5.000000e-01
store float %"40", ptr addrspace(5) %"35", align 4
%"42" = load i64, ptr addrspace(5) %"34", align 4
%"43" = load float, ptr addrspace(5) %"35", align 4
%"45" = inttoptr i64 %"42" to ptr
store float %"43", ptr %"45", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }


@ -0,0 +1,29 @@
define amdgpu_kernel void @constant_negative(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
%"33" = alloca i64, align 8, addrspace(5)
%"34" = alloca i64, align 8, addrspace(5)
%"35" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"30"
"30": ; preds = %1
%"36" = load i64, ptr addrspace(4) %"31", align 4
store i64 %"36", ptr addrspace(5) %"33", align 4
%"37" = load i64, ptr addrspace(4) %"32", align 4
store i64 %"37", ptr addrspace(5) %"34", align 4
%"39" = load i64, ptr addrspace(5) %"33", align 4
%"44" = inttoptr i64 %"39" to ptr
%"38" = load i32, ptr %"44", align 4
store i32 %"38", ptr addrspace(5) %"35", align 4
%"41" = load i32, ptr addrspace(5) %"35", align 4
%"40" = mul i32 %"41", -1
store i32 %"40", ptr addrspace(5) %"35", align 4
%"42" = load i64, ptr addrspace(5) %"34", align 4
%"43" = load i32, ptr addrspace(5) %"35", align 4
%"45" = inttoptr i64 %"42" to ptr
store i32 %"43", ptr %"45", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

33 ptx/src/test/ll/cos.ll Normal file

@ -0,0 +1,33 @@
define amdgpu_kernel void @cos(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
%"32" = alloca i64, align 8, addrspace(5)
%"33" = alloca i64, align 8, addrspace(5)
%"34" = alloca float, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"29"
"29": ; preds = %1
%"35" = load i64, ptr addrspace(4) %"30", align 4
store i64 %"35", ptr addrspace(5) %"32", align 4
%"36" = load i64, ptr addrspace(4) %"31", align 4
store i64 %"36", ptr addrspace(5) %"33", align 4
%"38" = load i64, ptr addrspace(5) %"32", align 4
%"43" = inttoptr i64 %"38" to ptr
%"37" = load float, ptr %"43", align 4
store float %"37", ptr addrspace(5) %"34", align 4
%"40" = load float, ptr addrspace(5) %"34", align 4
%"39" = call afn float @llvm.cos.f32(float %"40")
store float %"39", ptr addrspace(5) %"34", align 4
%"41" = load i64, ptr addrspace(5) %"33", align 4
%"42" = load float, ptr addrspace(5) %"34", align 4
%"44" = inttoptr i64 %"41" to ptr
store float %"42", ptr %"44", align 4
ret void
}
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare float @llvm.cos.f32(float) #1
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }


@ -0,0 +1,30 @@
define amdgpu_kernel void @cvt_f64_f32(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
%"33" = alloca i64, align 8, addrspace(5)
%"34" = alloca i64, align 8, addrspace(5)
%"35" = alloca float, align 4, addrspace(5)
%"36" = alloca double, align 8, addrspace(5)
br label %1
1: ; preds = %0
br label %"30"
"30": ; preds = %1
%"37" = load i64, ptr addrspace(4) %"31", align 4
store i64 %"37", ptr addrspace(5) %"33", align 4
%"38" = load i64, ptr addrspace(4) %"32", align 4
store i64 %"38", ptr addrspace(5) %"34", align 4
%"40" = load i64, ptr addrspace(5) %"33", align 4
%"45" = inttoptr i64 %"40" to ptr addrspace(1)
%"39" = load float, ptr addrspace(1) %"45", align 4
store float %"39", ptr addrspace(5) %"35", align 4
%"42" = load float, ptr addrspace(5) %"35", align 4
%"41" = fpext float %"42" to double
store double %"41", ptr addrspace(5) %"36", align 8
%"43" = load i64, ptr addrspace(5) %"34", align 4
%"44" = load double, ptr addrspace(5) %"36", align 8
%"46" = inttoptr i64 %"43" to ptr
store double %"44", ptr %"46", align 8
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }


@ -0,0 +1,49 @@
define amdgpu_kernel void @cvt_rni(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 {
%"37" = alloca i64, align 8, addrspace(5)
%"38" = alloca i64, align 8, addrspace(5)
%"39" = alloca float, align 4, addrspace(5)
%"40" = alloca float, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"34"
"34": ; preds = %1
%"41" = load i64, ptr addrspace(4) %"35", align 4
store i64 %"41", ptr addrspace(5) %"37", align 4
%"42" = load i64, ptr addrspace(4) %"36", align 4
store i64 %"42", ptr addrspace(5) %"38", align 4
%"44" = load i64, ptr addrspace(5) %"37", align 4
%"55" = inttoptr i64 %"44" to ptr
%"43" = load float, ptr %"55", align 4
store float %"43", ptr addrspace(5) %"39", align 4
%"45" = load i64, ptr addrspace(5) %"37", align 4
%"56" = inttoptr i64 %"45" to ptr
%"31" = getelementptr inbounds i8, ptr %"56", i64 4
%"46" = load float, ptr %"31", align 4
store float %"46", ptr addrspace(5) %"40", align 4
%"48" = load float, ptr addrspace(5) %"39", align 4
%2 = call float @llvm.roundeven.f32(float %"48")
%"47" = freeze float %2
store float %"47", ptr addrspace(5) %"39", align 4
%"50" = load float, ptr addrspace(5) %"40", align 4
%3 = call float @llvm.roundeven.f32(float %"50")
%"49" = freeze float %3
store float %"49", ptr addrspace(5) %"40", align 4
%"51" = load i64, ptr addrspace(5) %"38", align 4
%"52" = load float, ptr addrspace(5) %"39", align 4
%"57" = inttoptr i64 %"51" to ptr
store float %"52", ptr %"57", align 4
%"53" = load i64, ptr addrspace(5) %"38", align 4
%"58" = inttoptr i64 %"53" to ptr
%"33" = getelementptr inbounds i8, ptr %"58", i64 4
%"54" = load float, ptr addrspace(5) %"40", align 4
store float %"54", ptr %"33", align 4
ret void
}
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare float @llvm.roundeven.f32(float) #1
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }


@ -0,0 +1,54 @@
define amdgpu_kernel void @cvt_rzi(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 {
%"37" = alloca i64, align 8, addrspace(5)
%"38" = alloca i64, align 8, addrspace(5)
%"39" = alloca float, align 4, addrspace(5)
%"40" = alloca float, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"34"
"34": ; preds = %1
call void @llvm.amdgcn.s.setreg(i32 6145, i32 3)
%"41" = load i64, ptr addrspace(4) %"35", align 4
store i64 %"41", ptr addrspace(5) %"37", align 4
%"42" = load i64, ptr addrspace(4) %"36", align 4
store i64 %"42", ptr addrspace(5) %"38", align 4
%"44" = load i64, ptr addrspace(5) %"37", align 4
%"55" = inttoptr i64 %"44" to ptr
%"43" = load float, ptr %"55", align 4
store float %"43", ptr addrspace(5) %"39", align 4
%"45" = load i64, ptr addrspace(5) %"37", align 4
%"56" = inttoptr i64 %"45" to ptr
%"31" = getelementptr inbounds i8, ptr %"56", i64 4
%"46" = load float, ptr %"31", align 4
store float %"46", ptr addrspace(5) %"40", align 4
%"48" = load float, ptr addrspace(5) %"39", align 4
%2 = call float @llvm.trunc.f32(float %"48")
%"47" = freeze float %2
store float %"47", ptr addrspace(5) %"39", align 4
%"50" = load float, ptr addrspace(5) %"40", align 4
%3 = call float @llvm.trunc.f32(float %"50")
%"49" = freeze float %3
store float %"49", ptr addrspace(5) %"40", align 4
%"51" = load i64, ptr addrspace(5) %"38", align 4
%"52" = load float, ptr addrspace(5) %"39", align 4
%"57" = inttoptr i64 %"51" to ptr
store float %"52", ptr %"57", align 4
%"53" = load i64, ptr addrspace(5) %"38", align 4
%"58" = inttoptr i64 %"53" to ptr
%"33" = getelementptr inbounds i8, ptr %"58", i64 4
%"54" = load float, ptr addrspace(5) %"40", align 4
store float %"54", ptr %"33", align 4
ret void
}
; Function Attrs: nocallback nofree nosync nounwind willreturn
declare void @llvm.amdgcn.s.setreg(i32 immarg, i32) #1
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare float @llvm.trunc.f32(float) #2
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind willreturn }
attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }


@ -0,0 +1,32 @@
define amdgpu_kernel void @cvt_s16_s8(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
%"33" = alloca i64, align 8, addrspace(5)
%"34" = alloca i64, align 8, addrspace(5)
%"35" = alloca i32, align 4, addrspace(5)
%"36" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"30"
"30": ; preds = %1
%"37" = load i64, ptr addrspace(4) %"31", align 4
store i64 %"37", ptr addrspace(5) %"33", align 4
%"38" = load i64, ptr addrspace(4) %"32", align 4
store i64 %"38", ptr addrspace(5) %"34", align 4
%"40" = load i64, ptr addrspace(5) %"33", align 4
%"45" = inttoptr i64 %"40" to ptr addrspace(1)
%"39" = load i32, ptr addrspace(1) %"45", align 4
store i32 %"39", ptr addrspace(5) %"36", align 4
%"42" = load i32, ptr addrspace(5) %"36", align 4
%2 = trunc i32 %"42" to i8
%"46" = sext i8 %2 to i16
%"41" = sext i16 %"46" to i32
store i32 %"41", ptr addrspace(5) %"35", align 4
%"43" = load i64, ptr addrspace(5) %"34", align 4
%"44" = load i32, ptr addrspace(5) %"35", align 4
%"48" = inttoptr i64 %"43" to ptr
store i32 %"44", ptr %"48", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }


@ -0,0 +1,55 @@
define amdgpu_kernel void @cvt_s32_f32(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 {
%"37" = alloca i64, align 8, addrspace(5)
%"38" = alloca i64, align 8, addrspace(5)
%"39" = alloca i32, align 4, addrspace(5)
%"40" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"34"
"34": ; preds = %1
%"41" = load i64, ptr addrspace(4) %"35", align 4
store i64 %"41", ptr addrspace(5) %"37", align 4
%"42" = load i64, ptr addrspace(4) %"36", align 4
store i64 %"42", ptr addrspace(5) %"38", align 4
%"44" = load i64, ptr addrspace(5) %"37", align 4
%"56" = inttoptr i64 %"44" to ptr
%"55" = load float, ptr %"56", align 4
%"43" = bitcast float %"55" to i32
store i32 %"43", ptr addrspace(5) %"39", align 4
%"45" = load i64, ptr addrspace(5) %"37", align 4
%"57" = inttoptr i64 %"45" to ptr
%"31" = getelementptr inbounds i8, ptr %"57", i64 4
%"58" = load float, ptr %"31", align 4
%"46" = bitcast float %"58" to i32
store i32 %"46", ptr addrspace(5) %"40", align 4
%"48" = load i32, ptr addrspace(5) %"39", align 4
%"60" = bitcast i32 %"48" to float
%2 = call float @llvm.ceil.f32(float %"60")
%3 = fptosi float %2 to i32
%"59" = freeze i32 %3
store i32 %"59", ptr addrspace(5) %"39", align 4
%"50" = load i32, ptr addrspace(5) %"40", align 4
%"62" = bitcast i32 %"50" to float
%4 = call float @llvm.ceil.f32(float %"62")
%5 = fptosi float %4 to i32
%"61" = freeze i32 %5
store i32 %"61", ptr addrspace(5) %"40", align 4
%"51" = load i64, ptr addrspace(5) %"38", align 4
%"52" = load i32, ptr addrspace(5) %"39", align 4
%"63" = inttoptr i64 %"51" to ptr addrspace(1)
store i32 %"52", ptr addrspace(1) %"63", align 4
%"53" = load i64, ptr addrspace(5) %"38", align 4
%"65" = inttoptr i64 %"53" to ptr addrspace(1)
%"33" = getelementptr inbounds i8, ptr addrspace(1) %"65", i64 4
%"54" = load i32, ptr addrspace(5) %"40", align 4
store i32 %"54", ptr addrspace(1) %"33", align 4
ret void
}
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare float @llvm.ceil.f32(float) #1
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }


@ -0,0 +1,30 @@
define amdgpu_kernel void @cvt_s64_s32(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
%"33" = alloca i64, align 8, addrspace(5)
%"34" = alloca i64, align 8, addrspace(5)
%"35" = alloca i32, align 4, addrspace(5)
%"36" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
br label %"30"
"30": ; preds = %1
%"37" = load i64, ptr addrspace(4) %"31", align 4
store i64 %"37", ptr addrspace(5) %"33", align 4
%"38" = load i64, ptr addrspace(4) %"32", align 4
store i64 %"38", ptr addrspace(5) %"34", align 4
%"40" = load i64, ptr addrspace(5) %"33", align 4
%"46" = inttoptr i64 %"40" to ptr
%"45" = load i32, ptr %"46", align 4
store i32 %"45", ptr addrspace(5) %"35", align 4
%"42" = load i32, ptr addrspace(5) %"35", align 4
%"41" = sext i32 %"42" to i64
store i64 %"41", ptr addrspace(5) %"36", align 4
%"43" = load i64, ptr addrspace(5) %"34", align 4
%"44" = load i64, ptr addrspace(5) %"36", align 4
%"47" = inttoptr i64 %"43" to ptr
store i64 %"44", ptr %"47", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }


@ -0,0 +1,41 @@
define amdgpu_kernel void @cvt_sat_s_u(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) byref(i64) %"33") #0 {
%"34" = alloca i64, align 8, addrspace(5)
%"35" = alloca i64, align 8, addrspace(5)
%"36" = alloca i32, align 4, addrspace(5)
%"37" = alloca i32, align 4, addrspace(5)
%"38" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"31"
"31": ; preds = %1
%"39" = load i64, ptr addrspace(4) %"32", align 4
store i64 %"39", ptr addrspace(5) %"34", align 4
%"40" = load i64, ptr addrspace(4) %"33", align 4
store i64 %"40", ptr addrspace(5) %"35", align 4
%"42" = load i64, ptr addrspace(5) %"34", align 4
%"49" = inttoptr i64 %"42" to ptr
%"41" = load i32, ptr %"49", align 4
store i32 %"41", ptr addrspace(5) %"36", align 4
%"44" = load i32, ptr addrspace(5) %"36", align 4
%2 = call i32 @llvm.smax.i32(i32 %"44", i32 0)
%3 = call i32 @llvm.umin.i32(i32 %2, i32 -1)
store i32 %3, ptr addrspace(5) %"37", align 4
%"46" = load i32, ptr addrspace(5) %"37", align 4
store i32 %"46", ptr addrspace(5) %"38", align 4
%"47" = load i64, ptr addrspace(5) %"35", align 4
%"48" = load i32, ptr addrspace(5) %"38", align 4
%"50" = inttoptr i64 %"47" to ptr
store i32 %"48", ptr %"50", align 4
ret void
}
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.smax.i32(i32, i32) #1
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.umin.i32(i32, i32) #1
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }

34 ptx/src/test/ll/cvta.ll Normal file

@ -0,0 +1,34 @@
define amdgpu_kernel void @cvta(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
%"32" = alloca i64, align 8, addrspace(5)
%"33" = alloca i64, align 8, addrspace(5)
%"34" = alloca float, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"29"
"29": ; preds = %1
%"35" = load i64, ptr addrspace(4) %"30", align 4
store i64 %"35", ptr addrspace(5) %"32", align 4
%"36" = load i64, ptr addrspace(4) %"31", align 4
store i64 %"36", ptr addrspace(5) %"33", align 4
%"38" = load i64, ptr addrspace(5) %"32", align 4
%2 = inttoptr i64 %"38" to ptr
%"45" = addrspacecast ptr %2 to ptr addrspace(1)
store ptr addrspace(1) %"45", ptr addrspace(5) %"32", align 8
%"40" = load i64, ptr addrspace(5) %"33", align 4
%3 = inttoptr i64 %"40" to ptr
%"47" = addrspacecast ptr %3 to ptr addrspace(1)
store ptr addrspace(1) %"47", ptr addrspace(5) %"33", align 8
%"42" = load i64, ptr addrspace(5) %"32", align 4
%"49" = inttoptr i64 %"42" to ptr addrspace(1)
%"41" = load float, ptr addrspace(1) %"49", align 4
store float %"41", ptr addrspace(5) %"34", align 4
%"43" = load i64, ptr addrspace(5) %"33", align 4
%"44" = load float, ptr addrspace(5) %"34", align 4
%"50" = inttoptr i64 %"43" to ptr addrspace(1)
store float %"44", ptr addrspace(1) %"50", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }


@ -0,0 +1,36 @@
define amdgpu_kernel void @div_approx(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 {
%"35" = alloca i64, align 8, addrspace(5)
%"36" = alloca i64, align 8, addrspace(5)
%"37" = alloca float, align 4, addrspace(5)
%"38" = alloca float, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"32"
"32": ; preds = %1
%"39" = load i64, ptr addrspace(4) %"33", align 4
store i64 %"39", ptr addrspace(5) %"35", align 4
%"40" = load i64, ptr addrspace(4) %"34", align 4
store i64 %"40", ptr addrspace(5) %"36", align 4
%"42" = load i64, ptr addrspace(5) %"35", align 4
%"50" = inttoptr i64 %"42" to ptr
%"41" = load float, ptr %"50", align 4
store float %"41", ptr addrspace(5) %"37", align 4
%"43" = load i64, ptr addrspace(5) %"35", align 4
%"51" = inttoptr i64 %"43" to ptr
%"31" = getelementptr inbounds i8, ptr %"51", i64 4
%"44" = load float, ptr %"31", align 4
store float %"44", ptr addrspace(5) %"38", align 4
%"46" = load float, ptr addrspace(5) %"37", align 4
%"47" = load float, ptr addrspace(5) %"38", align 4
%"45" = fdiv arcp afn float %"46", %"47"
store float %"45", ptr addrspace(5) %"37", align 4
%"48" = load i64, ptr addrspace(5) %"36", align 4
%"49" = load float, ptr addrspace(5) %"37", align 4
%"52" = inttoptr i64 %"48" to ptr
store float %"49", ptr %"52", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }

33 ptx/src/test/ll/ex2.ll Normal file

@ -0,0 +1,33 @@
define amdgpu_kernel void @ex2(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
%"32" = alloca i64, align 8, addrspace(5)
%"33" = alloca i64, align 8, addrspace(5)
%"34" = alloca float, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"29"
"29": ; preds = %1
%"35" = load i64, ptr addrspace(4) %"30", align 4
store i64 %"35", ptr addrspace(5) %"32", align 4
%"36" = load i64, ptr addrspace(4) %"31", align 4
store i64 %"36", ptr addrspace(5) %"33", align 4
%"38" = load i64, ptr addrspace(5) %"32", align 4
%"43" = inttoptr i64 %"38" to ptr
%"37" = load float, ptr %"43", align 4
store float %"37", ptr addrspace(5) %"34", align 4
%"40" = load float, ptr addrspace(5) %"34", align 4
%"39" = call float @llvm.amdgcn.exp2.f32(float %"40")
store float %"39", ptr addrspace(5) %"34", align 4
%"41" = load i64, ptr addrspace(5) %"33", align 4
%"42" = load float, ptr addrspace(5) %"34", align 4
%"44" = inttoptr i64 %"41" to ptr
store float %"42", ptr %"44", align 4
ret void
}
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare float @llvm.amdgcn.exp2.f32(float) #1
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }


@ -0,0 +1,32 @@
@shared_mem = external addrspace(3) global [0 x i32]
define amdgpu_kernel void @extern_shared(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
%"33" = alloca i64, align 8, addrspace(5)
%"34" = alloca i64, align 8, addrspace(5)
%"35" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
br label %"30"
"30": ; preds = %1
%"36" = load i64, ptr addrspace(4) %"31", align 4
store i64 %"36", ptr addrspace(5) %"33", align 4
%"37" = load i64, ptr addrspace(4) %"32", align 4
store i64 %"37", ptr addrspace(5) %"34", align 4
%"39" = load i64, ptr addrspace(5) %"33", align 4
%"44" = inttoptr i64 %"39" to ptr addrspace(1)
%"38" = load i64, ptr addrspace(1) %"44", align 4
store i64 %"38", ptr addrspace(5) %"35", align 4
%"40" = load i64, ptr addrspace(5) %"35", align 4
store i64 %"40", ptr addrspace(3) @shared_mem, align 4
%"41" = load i64, ptr addrspace(3) @shared_mem, align 4
store i64 %"41", ptr addrspace(5) %"35", align 4
%"42" = load i64, ptr addrspace(5) %"34", align 4
%"43" = load i64, ptr addrspace(5) %"35", align 4
%"47" = inttoptr i64 %"42" to ptr addrspace(1)
store i64 %"43", ptr addrspace(1) %"47", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

View File

@ -0,0 +1,55 @@
@shared_mem = external addrspace(3) global [0 x i32], align 4
define void @incr_shared_2_global() #0 {
%"36" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
br label %"33"
"33": ; preds = %1
%"37" = load i64, ptr addrspace(3) @shared_mem, align 4
store i64 %"37", ptr addrspace(5) %"36", align 4
%"39" = load i64, ptr addrspace(5) %"36", align 4
%"38" = add i64 %"39", 2
store i64 %"38", ptr addrspace(5) %"36", align 4
%"40" = load i64, ptr addrspace(5) %"36", align 4
store i64 %"40", ptr addrspace(3) @shared_mem, align 4
ret void
}
define amdgpu_kernel void @extern_shared_call(ptr addrspace(4) byref(i64) %"41", ptr addrspace(4) byref(i64) %"42") #1 {
%"43" = alloca i64, align 8, addrspace(5)
%"44" = alloca i64, align 8, addrspace(5)
%"45" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
br label %"34"
"34": ; preds = %1
%"46" = load i64, ptr addrspace(4) %"41", align 4
store i64 %"46", ptr addrspace(5) %"43", align 4
%"47" = load i64, ptr addrspace(4) %"42", align 4
store i64 %"47", ptr addrspace(5) %"44", align 4
%"49" = load i64, ptr addrspace(5) %"43", align 4
%"56" = inttoptr i64 %"49" to ptr addrspace(1)
%"48" = load i64, ptr addrspace(1) %"56", align 4
store i64 %"48", ptr addrspace(5) %"45", align 4
%"50" = load i64, ptr addrspace(5) %"45", align 4
store i64 %"50", ptr addrspace(3) @shared_mem, align 4
call void @incr_shared_2_global()
br label %"35"
"35": ; preds = %"34"
%"51" = load i64, ptr addrspace(3) @shared_mem, align 4
store i64 %"51", ptr addrspace(5) %"45", align 4
%"52" = load i64, ptr addrspace(5) %"44", align 4
%"53" = load i64, ptr addrspace(5) %"45", align 4
%"59" = inttoptr i64 %"52" to ptr addrspace(1)
store i64 %"53", ptr addrspace(1) %"59", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

47 ptx/src/test/ll/fma.ll Normal file

@ -0,0 +1,47 @@
define amdgpu_kernel void @fma(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 {
%"38" = alloca i64, align 8, addrspace(5)
%"39" = alloca i64, align 8, addrspace(5)
%"40" = alloca float, align 4, addrspace(5)
%"41" = alloca float, align 4, addrspace(5)
%"42" = alloca float, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"35"
"35": ; preds = %1
%"43" = load i64, ptr addrspace(4) %"36", align 4
store i64 %"43", ptr addrspace(5) %"38", align 4
%"44" = load i64, ptr addrspace(4) %"37", align 4
store i64 %"44", ptr addrspace(5) %"39", align 4
%"46" = load i64, ptr addrspace(5) %"38", align 4
%"57" = inttoptr i64 %"46" to ptr
%"45" = load float, ptr %"57", align 4
store float %"45", ptr addrspace(5) %"40", align 4
%"47" = load i64, ptr addrspace(5) %"38", align 4
%"58" = inttoptr i64 %"47" to ptr
%"32" = getelementptr inbounds i8, ptr %"58", i64 4
%"48" = load float, ptr %"32", align 4
store float %"48", ptr addrspace(5) %"41", align 4
%"49" = load i64, ptr addrspace(5) %"38", align 4
%"59" = inttoptr i64 %"49" to ptr
%"34" = getelementptr inbounds i8, ptr %"59", i64 8
%"50" = load float, ptr %"34", align 4
store float %"50", ptr addrspace(5) %"42", align 4
%"52" = load float, ptr addrspace(5) %"40", align 4
%"53" = load float, ptr addrspace(5) %"41", align 4
%"54" = load float, ptr addrspace(5) %"42", align 4
%"51" = call float @llvm.fma.f32(float %"52", float %"53", float %"54")
store float %"51", ptr addrspace(5) %"40", align 4
%"55" = load i64, ptr addrspace(5) %"39", align 4
%"56" = load float, ptr addrspace(5) %"40", align 4
%"60" = inttoptr i64 %"55" to ptr
store float %"56", ptr %"60", align 4
ret void
}
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare float @llvm.fma.f32(float, float, float) #1
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }

View File

@ -0,0 +1,27 @@
@foobar = addrspace(1) global [4 x i32] [i32 1, i32 0, i32 0, i32 0]
define amdgpu_kernel void @global_array(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
%"33" = alloca i64, align 8, addrspace(5)
%"34" = alloca i64, align 8, addrspace(5)
%"35" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"30"
"30": ; preds = %1
store i64 ptrtoint (ptr addrspace(1) @foobar to i64), ptr addrspace(5) %"33", align 4
%"37" = load i64, ptr addrspace(4) %"32", align 4
store i64 %"37", ptr addrspace(5) %"34", align 4
%"39" = load i64, ptr addrspace(5) %"33", align 4
%"43" = inttoptr i64 %"39" to ptr addrspace(1)
%"38" = load i32, ptr addrspace(1) %"43", align 4
store i32 %"38", ptr addrspace(5) %"35", align 4
%"40" = load i64, ptr addrspace(5) %"34", align 4
%"41" = load i32, ptr addrspace(5) %"35", align 4
%"44" = inttoptr i64 %"40" to ptr addrspace(1)
store i32 %"41", ptr addrspace(1) %"44", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

26 ptx/src/test/ll/ld_st.ll Normal file

@ -0,0 +1,26 @@
define amdgpu_kernel void @ld_st(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
%"32" = alloca i64, align 8, addrspace(5)
%"33" = alloca i64, align 8, addrspace(5)
%"34" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
br label %"29"
"29": ; preds = %1
%"35" = load i64, ptr addrspace(4) %"30", align 4
store i64 %"35", ptr addrspace(5) %"32", align 4
%"36" = load i64, ptr addrspace(4) %"31", align 4
store i64 %"36", ptr addrspace(5) %"33", align 4
%"38" = load i64, ptr addrspace(5) %"32", align 4
%"41" = inttoptr i64 %"38" to ptr
%"37" = load i64, ptr %"41", align 4
store i64 %"37", ptr addrspace(5) %"34", align 4
%"39" = load i64, ptr addrspace(5) %"33", align 4
%"40" = load i64, ptr addrspace(5) %"34", align 4
%"42" = inttoptr i64 %"39" to ptr
store i64 %"40", ptr %"42", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }


@ -0,0 +1,31 @@
define amdgpu_kernel void @ld_st_implicit(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
%"33" = alloca i64, align 8, addrspace(5)
%"34" = alloca i64, align 8, addrspace(5)
%"35" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
br label %"30"
"30": ; preds = %1
%"36" = load i64, ptr addrspace(4) %"31", align 4
store i64 %"36", ptr addrspace(5) %"33", align 4
%"37" = load i64, ptr addrspace(4) %"32", align 4
store i64 %"37", ptr addrspace(5) %"34", align 4
store i64 81985529216486895, ptr addrspace(5) %"35", align 4
%"40" = load i64, ptr addrspace(5) %"33", align 4
%"44" = inttoptr i64 %"40" to ptr addrspace(1)
%"43" = load float, ptr addrspace(1) %"44", align 4
%2 = bitcast float %"43" to i32
%"39" = zext i32 %2 to i64
store i64 %"39", ptr addrspace(5) %"35", align 4
%"41" = load i64, ptr addrspace(5) %"34", align 4
%"42" = load i64, ptr addrspace(5) %"35", align 4
%"45" = inttoptr i64 %"41" to ptr addrspace(1)
%3 = trunc i64 %"42" to i32
%"46" = bitcast i32 %3 to float
store float %"46", ptr addrspace(1) %"45", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }


@ -0,0 +1,37 @@
define amdgpu_kernel void @ld_st_offset(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 {
%"37" = alloca i64, align 8, addrspace(5)
%"38" = alloca i64, align 8, addrspace(5)
%"39" = alloca i32, align 4, addrspace(5)
%"40" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"34"
"34": ; preds = %1
%"41" = load i64, ptr addrspace(4) %"35", align 4
store i64 %"41", ptr addrspace(5) %"37", align 4
%"42" = load i64, ptr addrspace(4) %"36", align 4
store i64 %"42", ptr addrspace(5) %"38", align 4
%"44" = load i64, ptr addrspace(5) %"37", align 4
%"51" = inttoptr i64 %"44" to ptr
%"43" = load i32, ptr %"51", align 4
store i32 %"43", ptr addrspace(5) %"39", align 4
%"45" = load i64, ptr addrspace(5) %"37", align 4
%"52" = inttoptr i64 %"45" to ptr
%"31" = getelementptr inbounds i8, ptr %"52", i64 4
%"46" = load i32, ptr %"31", align 4
store i32 %"46", ptr addrspace(5) %"40", align 4
%"47" = load i64, ptr addrspace(5) %"38", align 4
%"48" = load i32, ptr addrspace(5) %"40", align 4
%"53" = inttoptr i64 %"47" to ptr
store i32 %"48", ptr %"53", align 4
%"49" = load i64, ptr addrspace(5) %"38", align 4
%"54" = inttoptr i64 %"49" to ptr
%"33" = getelementptr inbounds i8, ptr %"54", i64 4
%"50" = load i32, ptr addrspace(5) %"39", align 4
store i32 %"50", ptr %"33", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

ptx/src/test/ll/lg2.ll Normal file (33 lines)

@ -0,0 +1,33 @@
define amdgpu_kernel void @lg2(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
%"32" = alloca i64, align 8, addrspace(5)
%"33" = alloca i64, align 8, addrspace(5)
%"34" = alloca float, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"29"
"29": ; preds = %1
%"35" = load i64, ptr addrspace(4) %"30", align 4
store i64 %"35", ptr addrspace(5) %"32", align 4
%"36" = load i64, ptr addrspace(4) %"31", align 4
store i64 %"36", ptr addrspace(5) %"33", align 4
%"38" = load i64, ptr addrspace(5) %"32", align 4
%"43" = inttoptr i64 %"38" to ptr
%"37" = load float, ptr %"43", align 4
store float %"37", ptr addrspace(5) %"34", align 4
%"40" = load float, ptr addrspace(5) %"34", align 4
%"39" = call float @llvm.amdgcn.log.f32(float %"40")
store float %"39", ptr addrspace(5) %"34", align 4
%"41" = load i64, ptr addrspace(5) %"33", align 4
%"42" = load float, ptr addrspace(5) %"34", align 4
%"44" = inttoptr i64 %"41" to ptr
store float %"42", ptr %"44", align 4
ret void
}
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare float @llvm.amdgcn.log.f32(float) #1
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }


@ -0,0 +1,27 @@
define amdgpu_kernel void @local_align(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
%"10" = alloca [8 x i8], align 8, addrspace(5)
%"33" = alloca i64, align 8, addrspace(5)
%"34" = alloca i64, align 8, addrspace(5)
%"35" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
br label %"30"
"30": ; preds = %1
%"36" = load i64, ptr addrspace(4) %"31", align 4
store i64 %"36", ptr addrspace(5) %"33", align 4
%"37" = load i64, ptr addrspace(4) %"32", align 4
store i64 %"37", ptr addrspace(5) %"34", align 4
%"39" = load i64, ptr addrspace(5) %"33", align 4
%"42" = inttoptr i64 %"39" to ptr
%"38" = load i64, ptr %"42", align 4
store i64 %"38", ptr addrspace(5) %"35", align 4
%"40" = load i64, ptr addrspace(5) %"34", align 4
%"41" = load i64, ptr addrspace(5) %"35", align 4
%"43" = inttoptr i64 %"40" to ptr
store i64 %"41", ptr %"43", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }


@ -0,0 +1,55 @@
define amdgpu_kernel void @mad_s32(ptr addrspace(4) byref(i64) %"41", ptr addrspace(4) byref(i64) %"42") #0 {
%"43" = alloca i64, align 8, addrspace(5)
%"44" = alloca i64, align 8, addrspace(5)
%"45" = alloca i32, align 4, addrspace(5)
%"46" = alloca i32, align 4, addrspace(5)
%"47" = alloca i32, align 4, addrspace(5)
%"48" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"40"
"40": ; preds = %1
%"49" = load i64, ptr addrspace(4) %"41", align 4
store i64 %"49", ptr addrspace(5) %"43", align 4
%"50" = load i64, ptr addrspace(4) %"42", align 4
store i64 %"50", ptr addrspace(5) %"44", align 4
%"52" = load i64, ptr addrspace(5) %"43", align 4
%"67" = inttoptr i64 %"52" to ptr
%"51" = load i32, ptr %"67", align 4
store i32 %"51", ptr addrspace(5) %"46", align 4
%"53" = load i64, ptr addrspace(5) %"43", align 4
%"68" = inttoptr i64 %"53" to ptr
%"33" = getelementptr inbounds i8, ptr %"68", i64 4
%"54" = load i32, ptr %"33", align 4
store i32 %"54", ptr addrspace(5) %"47", align 4
%"55" = load i64, ptr addrspace(5) %"43", align 4
%"69" = inttoptr i64 %"55" to ptr
%"35" = getelementptr inbounds i8, ptr %"69", i64 8
%"56" = load i32, ptr %"35", align 4
store i32 %"56", ptr addrspace(5) %"48", align 4
%"58" = load i32, ptr addrspace(5) %"46", align 4
%"59" = load i32, ptr addrspace(5) %"47", align 4
%"60" = load i32, ptr addrspace(5) %"48", align 4
%2 = mul i32 %"58", %"59"
%"57" = add i32 %2, %"60"
store i32 %"57", ptr addrspace(5) %"45", align 4
%"61" = load i64, ptr addrspace(5) %"44", align 4
%"62" = load i32, ptr addrspace(5) %"45", align 4
%"70" = inttoptr i64 %"61" to ptr
store i32 %"62", ptr %"70", align 4
%"63" = load i64, ptr addrspace(5) %"44", align 4
%"71" = inttoptr i64 %"63" to ptr
%"37" = getelementptr inbounds i8, ptr %"71", i64 4
%"64" = load i32, ptr addrspace(5) %"45", align 4
store i32 %"64", ptr %"37", align 4
%"65" = load i64, ptr addrspace(5) %"44", align 4
%"72" = inttoptr i64 %"65" to ptr
%"39" = getelementptr inbounds i8, ptr %"72", i64 8
%"66" = load i32, ptr addrspace(5) %"45", align 4
store i32 %"66", ptr %"39", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }


@ -0,0 +1,33 @@
define amdgpu_kernel void @malformed_label(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 {
%"36" = alloca i64, align 8, addrspace(5)
%"37" = alloca i64, align 8, addrspace(5)
%"38" = alloca i64, align 8, addrspace(5)
%"39" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
br label %"32"
"32": ; preds = %1
%"40" = load i64, ptr addrspace(4) %"34", align 4
store i64 %"40", ptr addrspace(5) %"36", align 4
%"41" = load i64, ptr addrspace(4) %"35", align 4
store i64 %"41", ptr addrspace(5) %"37", align 4
br label %"10"
"10": ; preds = %"32"
%"43" = load i64, ptr addrspace(5) %"36", align 4
%"48" = inttoptr i64 %"43" to ptr
%"42" = load i64, ptr %"48", align 4
store i64 %"42", ptr addrspace(5) %"38", align 4
%"45" = load i64, ptr addrspace(5) %"38", align 4
%"44" = add i64 %"45", 1
store i64 %"44", ptr addrspace(5) %"39", align 4
%"46" = load i64, ptr addrspace(5) %"37", align 4
%"47" = load i64, ptr addrspace(5) %"39", align 4
%"49" = inttoptr i64 %"46" to ptr
store i64 %"47", ptr %"49", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

ptx/src/test/ll/max.ll Normal file (40 lines)

@ -0,0 +1,40 @@
define amdgpu_kernel void @max(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 {
%"35" = alloca i64, align 8, addrspace(5)
%"36" = alloca i64, align 8, addrspace(5)
%"37" = alloca i32, align 4, addrspace(5)
%"38" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"32"
"32": ; preds = %1
%"39" = load i64, ptr addrspace(4) %"33", align 4
store i64 %"39", ptr addrspace(5) %"35", align 4
%"40" = load i64, ptr addrspace(4) %"34", align 4
store i64 %"40", ptr addrspace(5) %"36", align 4
%"42" = load i64, ptr addrspace(5) %"35", align 4
%"50" = inttoptr i64 %"42" to ptr
%"41" = load i32, ptr %"50", align 4
store i32 %"41", ptr addrspace(5) %"37", align 4
%"43" = load i64, ptr addrspace(5) %"35", align 4
%"51" = inttoptr i64 %"43" to ptr
%"31" = getelementptr inbounds i8, ptr %"51", i64 4
%"44" = load i32, ptr %"31", align 4
store i32 %"44", ptr addrspace(5) %"38", align 4
%"46" = load i32, ptr addrspace(5) %"37", align 4
%"47" = load i32, ptr addrspace(5) %"38", align 4
%"45" = call i32 @llvm.smax.i32(i32 %"46", i32 %"47")
store i32 %"45", ptr addrspace(5) %"37", align 4
%"48" = load i64, ptr addrspace(5) %"36", align 4
%"49" = load i32, ptr addrspace(5) %"37", align 4
%"52" = inttoptr i64 %"48" to ptr
store i32 %"49", ptr %"52", align 4
ret void
}
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.smax.i32(i32, i32) #1
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }

ptx/src/test/ll/membar.ll Normal file (27 lines)

@ -0,0 +1,27 @@
define amdgpu_kernel void @membar(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
%"32" = alloca i64, align 8, addrspace(5)
%"33" = alloca i64, align 8, addrspace(5)
%"34" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"29"
"29": ; preds = %1
%"35" = load i64, ptr addrspace(4) %"30", align 4
store i64 %"35", ptr addrspace(5) %"32", align 4
%"36" = load i64, ptr addrspace(4) %"31", align 4
store i64 %"36", ptr addrspace(5) %"33", align 4
%"38" = load i64, ptr addrspace(5) %"32", align 4
%"42" = inttoptr i64 %"38" to ptr
%"41" = load i32, ptr %"42", align 4
store i32 %"41", ptr addrspace(5) %"34", align 4
fence seq_cst
%"39" = load i64, ptr addrspace(5) %"33", align 4
%"40" = load i32, ptr addrspace(5) %"34", align 4
%"43" = inttoptr i64 %"39" to ptr
store i32 %"40", ptr %"43", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

ptx/src/test/ll/min.ll Normal file (40 lines)

@ -0,0 +1,40 @@
define amdgpu_kernel void @min(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 {
%"35" = alloca i64, align 8, addrspace(5)
%"36" = alloca i64, align 8, addrspace(5)
%"37" = alloca i32, align 4, addrspace(5)
%"38" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"32"
"32": ; preds = %1
%"39" = load i64, ptr addrspace(4) %"33", align 4
store i64 %"39", ptr addrspace(5) %"35", align 4
%"40" = load i64, ptr addrspace(4) %"34", align 4
store i64 %"40", ptr addrspace(5) %"36", align 4
%"42" = load i64, ptr addrspace(5) %"35", align 4
%"50" = inttoptr i64 %"42" to ptr
%"41" = load i32, ptr %"50", align 4
store i32 %"41", ptr addrspace(5) %"37", align 4
%"43" = load i64, ptr addrspace(5) %"35", align 4
%"51" = inttoptr i64 %"43" to ptr
%"31" = getelementptr inbounds i8, ptr %"51", i64 4
%"44" = load i32, ptr %"31", align 4
store i32 %"44", ptr addrspace(5) %"38", align 4
%"46" = load i32, ptr addrspace(5) %"37", align 4
%"47" = load i32, ptr addrspace(5) %"38", align 4
%"45" = call i32 @llvm.smin.i32(i32 %"46", i32 %"47")
store i32 %"45", ptr addrspace(5) %"37", align 4
%"48" = load i64, ptr addrspace(5) %"36", align 4
%"49" = load i32, ptr addrspace(5) %"37", align 4
%"52" = inttoptr i64 %"48" to ptr
store i32 %"49", ptr %"52", align 4
ret void
}
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.smin.i32(i32, i32) #1
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }

ptx/src/test/ll/mov.ll Normal file (29 lines)

@ -0,0 +1,29 @@
define amdgpu_kernel void @mov(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
%"33" = alloca i64, align 8, addrspace(5)
%"34" = alloca i64, align 8, addrspace(5)
%"35" = alloca i64, align 8, addrspace(5)
%"36" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
br label %"30"
"30": ; preds = %1
%"37" = load i64, ptr addrspace(4) %"31", align 4
store i64 %"37", ptr addrspace(5) %"33", align 4
%"38" = load i64, ptr addrspace(4) %"32", align 4
store i64 %"38", ptr addrspace(5) %"34", align 4
%"40" = load i64, ptr addrspace(5) %"33", align 4
%"45" = inttoptr i64 %"40" to ptr
%"39" = load i64, ptr %"45", align 4
store i64 %"39", ptr addrspace(5) %"35", align 4
%"42" = load i64, ptr addrspace(5) %"35", align 4
store i64 %"42", ptr addrspace(5) %"36", align 4
%"43" = load i64, ptr addrspace(5) %"34", align 4
%"44" = load i64, ptr addrspace(5) %"36", align 4
%"46" = inttoptr i64 %"43" to ptr
store i64 %"44", ptr %"46", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }


@ -0,0 +1,15 @@
define amdgpu_kernel void @mov_address(ptr addrspace(4) byref(i64) %"29", ptr addrspace(4) byref(i64) %"30") #0 {
%"10" = alloca [8 x i8], align 1, addrspace(5)
%"31" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
br label %"28"
"28": ; preds = %1
%"33" = ptrtoint ptr addrspace(5) %"10" to i64
store i64 %"33", ptr addrspace(5) %"31", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }


@ -0,0 +1,46 @@
define amdgpu_kernel void @mul24_hi_s32(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) byref(i64) %"33") #0 {
%"34" = alloca i64, align 8, addrspace(5)
%"35" = alloca i64, align 8, addrspace(5)
%"36" = alloca i32, align 4, addrspace(5)
%"37" = alloca i32, align 4, addrspace(5)
%"38" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"31"
"31": ; preds = %1
%"39" = load i64, ptr addrspace(4) %"32", align 4
store i64 %"39", ptr addrspace(5) %"34", align 4
%"40" = load i64, ptr addrspace(4) %"33", align 4
store i64 %"40", ptr addrspace(5) %"35", align 4
%"42" = load i64, ptr addrspace(5) %"34", align 4
%"50" = inttoptr i64 %"42" to ptr
%"41" = load i32, ptr %"50", align 4
store i32 %"41", ptr addrspace(5) %"36", align 4
%"44" = load i32, ptr addrspace(5) %"36", align 4
%"43" = sub i32 0, %"44"
store i32 %"43", ptr addrspace(5) %"37", align 4
%"46" = load i32, ptr addrspace(5) %"37", align 4
%"47" = load i32, ptr addrspace(5) %"36", align 4
%2 = call i32 @llvm.amdgcn.mul.i24(i32 %"46", i32 %"47")
%3 = call i32 @llvm.amdgcn.mulhi.i24(i32 %"46", i32 %"47")
%4 = lshr i32 %2, 16
%5 = shl i32 %3, 16
%"45" = or i32 %4, %5
store i32 %"45", ptr addrspace(5) %"38", align 4
%"48" = load i64, ptr addrspace(5) %"35", align 4
%"49" = load i32, ptr addrspace(5) %"38", align 4
%"51" = inttoptr i64 %"48" to ptr
store i32 %"49", ptr %"51", align 4
ret void
}
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.amdgcn.mul.i24(i32, i32) #1
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.amdgcn.mulhi.i24(i32, i32) #1
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }


@ -0,0 +1,42 @@
define amdgpu_kernel void @mul24_hi_u32(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
%"33" = alloca i64, align 8, addrspace(5)
%"34" = alloca i64, align 8, addrspace(5)
%"35" = alloca i32, align 4, addrspace(5)
%"36" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"30"
"30": ; preds = %1
%"37" = load i64, ptr addrspace(4) %"31", align 4
store i64 %"37", ptr addrspace(5) %"33", align 4
%"38" = load i64, ptr addrspace(4) %"32", align 4
store i64 %"38", ptr addrspace(5) %"34", align 4
%"40" = load i64, ptr addrspace(5) %"33", align 4
%"46" = inttoptr i64 %"40" to ptr
%"39" = load i32, ptr %"46", align 4
store i32 %"39", ptr addrspace(5) %"35", align 4
%"42" = load i32, ptr addrspace(5) %"35", align 4
%"43" = load i32, ptr addrspace(5) %"35", align 4
%2 = call i32 @llvm.amdgcn.mul.u24(i32 %"42", i32 %"43")
%3 = call i32 @llvm.amdgcn.mulhi.u24(i32 %"42", i32 %"43")
%4 = lshr i32 %2, 16
%5 = shl i32 %3, 16
%"41" = or i32 %4, %5
store i32 %"41", ptr addrspace(5) %"36", align 4
%"44" = load i64, ptr addrspace(5) %"34", align 4
%"45" = load i32, ptr addrspace(5) %"36", align 4
%"47" = inttoptr i64 %"44" to ptr
store i32 %"45", ptr %"47", align 4
ret void
}
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.amdgcn.mul.u24(i32, i32) #1
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.amdgcn.mulhi.u24(i32, i32) #1
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }


@ -0,0 +1,39 @@
define amdgpu_kernel void @mul24_lo_s32(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) byref(i64) %"33") #0 {
%"34" = alloca i64, align 8, addrspace(5)
%"35" = alloca i64, align 8, addrspace(5)
%"36" = alloca i32, align 4, addrspace(5)
%"37" = alloca i32, align 4, addrspace(5)
%"38" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"31"
"31": ; preds = %1
%"39" = load i64, ptr addrspace(4) %"32", align 4
store i64 %"39", ptr addrspace(5) %"34", align 4
%"40" = load i64, ptr addrspace(4) %"33", align 4
store i64 %"40", ptr addrspace(5) %"35", align 4
%"42" = load i64, ptr addrspace(5) %"34", align 4
%"50" = inttoptr i64 %"42" to ptr
%"41" = load i32, ptr %"50", align 4
store i32 %"41", ptr addrspace(5) %"36", align 4
%"44" = load i32, ptr addrspace(5) %"36", align 4
%"43" = sub i32 0, %"44"
store i32 %"43", ptr addrspace(5) %"37", align 4
%"46" = load i32, ptr addrspace(5) %"37", align 4
%"47" = load i32, ptr addrspace(5) %"36", align 4
%"45" = call i32 @llvm.amdgcn.mul.i24(i32 %"46", i32 %"47")
store i32 %"45", ptr addrspace(5) %"38", align 4
%"48" = load i64, ptr addrspace(5) %"35", align 4
%"49" = load i32, ptr addrspace(5) %"38", align 4
%"51" = inttoptr i64 %"48" to ptr
store i32 %"49", ptr %"51", align 4
ret void
}
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.amdgcn.mul.i24(i32, i32) #1
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }


@ -0,0 +1,35 @@
define amdgpu_kernel void @mul24_lo_u32(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
%"33" = alloca i64, align 8, addrspace(5)
%"34" = alloca i64, align 8, addrspace(5)
%"35" = alloca i32, align 4, addrspace(5)
%"36" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"30"
"30": ; preds = %1
%"37" = load i64, ptr addrspace(4) %"31", align 4
store i64 %"37", ptr addrspace(5) %"33", align 4
%"38" = load i64, ptr addrspace(4) %"32", align 4
store i64 %"38", ptr addrspace(5) %"34", align 4
%"40" = load i64, ptr addrspace(5) %"33", align 4
%"46" = inttoptr i64 %"40" to ptr
%"39" = load i32, ptr %"46", align 4
store i32 %"39", ptr addrspace(5) %"35", align 4
%"42" = load i32, ptr addrspace(5) %"35", align 4
%"43" = load i32, ptr addrspace(5) %"35", align 4
%"41" = call i32 @llvm.amdgcn.mul.u24(i32 %"42", i32 %"43")
store i32 %"41", ptr addrspace(5) %"36", align 4
%"44" = load i64, ptr addrspace(5) %"34", align 4
%"45" = load i32, ptr addrspace(5) %"36", align 4
%"47" = inttoptr i64 %"44" to ptr
store i32 %"45", ptr %"47", align 4
ret void
}
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.amdgcn.mul.u24(i32, i32) #1
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }


@ -0,0 +1,36 @@
define amdgpu_kernel void @mul_ftz(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 {
%"35" = alloca i64, align 8, addrspace(5)
%"36" = alloca i64, align 8, addrspace(5)
%"37" = alloca float, align 4, addrspace(5)
%"38" = alloca float, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"32"
"32": ; preds = %1
%"39" = load i64, ptr addrspace(4) %"33", align 4
store i64 %"39", ptr addrspace(5) %"35", align 4
%"40" = load i64, ptr addrspace(4) %"34", align 4
store i64 %"40", ptr addrspace(5) %"36", align 4
%"42" = load i64, ptr addrspace(5) %"35", align 4
%"50" = inttoptr i64 %"42" to ptr
%"41" = load float, ptr %"50", align 4
store float %"41", ptr addrspace(5) %"37", align 4
%"43" = load i64, ptr addrspace(5) %"35", align 4
%"51" = inttoptr i64 %"43" to ptr
%"31" = getelementptr inbounds i8, ptr %"51", i64 4
%"44" = load float, ptr %"31", align 4
store float %"44", ptr addrspace(5) %"38", align 4
%"46" = load float, ptr addrspace(5) %"37", align 4
%"47" = load float, ptr addrspace(5) %"38", align 4
%"45" = fmul float %"46", %"47"
store float %"45", ptr addrspace(5) %"37", align 4
%"48" = load i64, ptr addrspace(5) %"36", align 4
%"49" = load float, ptr addrspace(5) %"37", align 4
%"52" = inttoptr i64 %"48" to ptr
store float %"49", ptr %"52", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

ptx/src/test/ll/mul_hi.ll Normal file (33 lines)

@ -0,0 +1,33 @@
define amdgpu_kernel void @mul_hi(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) byref(i64) %"33") #0 {
%"34" = alloca i64, align 8, addrspace(5)
%"35" = alloca i64, align 8, addrspace(5)
%"36" = alloca i64, align 8, addrspace(5)
%"37" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
br label %"31"
"31": ; preds = %1
%"38" = load i64, ptr addrspace(4) %"32", align 4
store i64 %"38", ptr addrspace(5) %"34", align 4
%"39" = load i64, ptr addrspace(4) %"33", align 4
store i64 %"39", ptr addrspace(5) %"35", align 4
%"41" = load i64, ptr addrspace(5) %"34", align 4
%"46" = inttoptr i64 %"41" to ptr
%"40" = load i64, ptr %"46", align 4
store i64 %"40", ptr addrspace(5) %"36", align 4
%"43" = load i64, ptr addrspace(5) %"36", align 4
%2 = zext i64 %"43" to i128
%3 = mul i128 %2, 2
%4 = lshr i128 %3, 64
%"42" = trunc i128 %4 to i64
store i64 %"42", ptr addrspace(5) %"37", align 4
%"44" = load i64, ptr addrspace(5) %"35", align 4
%"45" = load i64, ptr addrspace(5) %"37", align 4
%"47" = inttoptr i64 %"44" to ptr
store i64 %"45", ptr %"47", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

ptx/src/test/ll/mul_lo.ll Normal file (30 lines)

@ -0,0 +1,30 @@
define amdgpu_kernel void @mul_lo(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) byref(i64) %"33") #0 {
%"34" = alloca i64, align 8, addrspace(5)
%"35" = alloca i64, align 8, addrspace(5)
%"36" = alloca i64, align 8, addrspace(5)
%"37" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
br label %"31"
"31": ; preds = %1
%"38" = load i64, ptr addrspace(4) %"32", align 4
store i64 %"38", ptr addrspace(5) %"34", align 4
%"39" = load i64, ptr addrspace(4) %"33", align 4
store i64 %"39", ptr addrspace(5) %"35", align 4
%"41" = load i64, ptr addrspace(5) %"34", align 4
%"46" = inttoptr i64 %"41" to ptr
%"40" = load i64, ptr %"46", align 4
store i64 %"40", ptr addrspace(5) %"36", align 4
%"43" = load i64, ptr addrspace(5) %"36", align 4
%"42" = mul i64 %"43", 2
store i64 %"42", ptr addrspace(5) %"37", align 4
%"44" = load i64, ptr addrspace(5) %"35", align 4
%"45" = load i64, ptr addrspace(5) %"37", align 4
%"47" = inttoptr i64 %"44" to ptr
store i64 %"45", ptr %"47", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }


@ -0,0 +1,36 @@
define amdgpu_kernel void @mul_non_ftz(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 {
%"35" = alloca i64, align 8, addrspace(5)
%"36" = alloca i64, align 8, addrspace(5)
%"37" = alloca float, align 4, addrspace(5)
%"38" = alloca float, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"32"
"32": ; preds = %1
%"39" = load i64, ptr addrspace(4) %"33", align 4
store i64 %"39", ptr addrspace(5) %"35", align 4
%"40" = load i64, ptr addrspace(4) %"34", align 4
store i64 %"40", ptr addrspace(5) %"36", align 4
%"42" = load i64, ptr addrspace(5) %"35", align 4
%"50" = inttoptr i64 %"42" to ptr
%"41" = load float, ptr %"50", align 4
store float %"41", ptr addrspace(5) %"37", align 4
%"43" = load i64, ptr addrspace(5) %"35", align 4
%"51" = inttoptr i64 %"43" to ptr
%"31" = getelementptr inbounds i8, ptr %"51", i64 4
%"44" = load float, ptr %"31", align 4
store float %"44", ptr addrspace(5) %"38", align 4
%"46" = load float, ptr addrspace(5) %"37", align 4
%"47" = load float, ptr addrspace(5) %"38", align 4
%"45" = fmul float %"46", %"47"
store float %"45", ptr addrspace(5) %"37", align 4
%"48" = load i64, ptr addrspace(5) %"36", align 4
%"49" = load float, ptr addrspace(5) %"37", align 4
%"52" = inttoptr i64 %"48" to ptr
store float %"49", ptr %"52", align 4
ret void
}
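; unlike mul_ftz above, this kernel keeps "denormal-fp-math-f32"="ieee", so f32 denormals are not flushed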
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }


@ -0,0 +1,39 @@
define amdgpu_kernel void @mul_wide(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 {
%"36" = alloca i64, align 8, addrspace(5)
%"37" = alloca i64, align 8, addrspace(5)
%"38" = alloca i32, align 4, addrspace(5)
%"39" = alloca i32, align 4, addrspace(5)
%"40" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
br label %"33"
"33": ; preds = %1
%"41" = load i64, ptr addrspace(4) %"34", align 4
store i64 %"41", ptr addrspace(5) %"36", align 4
%"42" = load i64, ptr addrspace(4) %"35", align 4
store i64 %"42", ptr addrspace(5) %"37", align 4
%"44" = load i64, ptr addrspace(5) %"36", align 4
%"52" = inttoptr i64 %"44" to ptr addrspace(1)
%"43" = load i32, ptr addrspace(1) %"52", align 4
store i32 %"43", ptr addrspace(5) %"38", align 4
%"45" = load i64, ptr addrspace(5) %"36", align 4
%"53" = inttoptr i64 %"45" to ptr addrspace(1)
%"32" = getelementptr inbounds i8, ptr addrspace(1) %"53", i64 4
%"46" = load i32, ptr addrspace(1) %"32", align 4
store i32 %"46", ptr addrspace(5) %"39", align 4
%"48" = load i32, ptr addrspace(5) %"38", align 4
%"49" = load i32, ptr addrspace(5) %"39", align 4
%2 = sext i32 %"48" to i64
%3 = sext i32 %"49" to i64
%"47" = mul i64 %2, %3
store i64 %"47", ptr addrspace(5) %"40", align 4
%"50" = load i64, ptr addrspace(5) %"37", align 4
%"51" = load i64, ptr addrspace(5) %"40", align 4
%"54" = inttoptr i64 %"50" to ptr
store i64 %"51", ptr %"54", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

ptx/src/test/ll/neg.ll Normal file (29 lines)

@ -0,0 +1,29 @@
define amdgpu_kernel void @neg(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
%"32" = alloca i64, align 8, addrspace(5)
%"33" = alloca i64, align 8, addrspace(5)
%"34" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"29"
"29": ; preds = %1
%"35" = load i64, ptr addrspace(4) %"30", align 4
store i64 %"35", ptr addrspace(5) %"32", align 4
%"36" = load i64, ptr addrspace(4) %"31", align 4
store i64 %"36", ptr addrspace(5) %"33", align 4
%"38" = load i64, ptr addrspace(5) %"32", align 4
%"43" = inttoptr i64 %"38" to ptr
%"37" = load i32, ptr %"43", align 4
store i32 %"37", ptr addrspace(5) %"34", align 4
%"40" = load i32, ptr addrspace(5) %"34", align 4
%"39" = sub i32 0, %"40"
store i32 %"39", ptr addrspace(5) %"34", align 4
%"41" = load i64, ptr addrspace(5) %"33", align 4
%"42" = load i32, ptr addrspace(5) %"34", align 4
%"44" = inttoptr i64 %"41" to ptr
store i32 %"42", ptr %"44", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }


@ -0,0 +1,35 @@
define amdgpu_kernel void @non_scalar_ptr_offset(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 {
%"36" = alloca i64, align 8, addrspace(5)
%"37" = alloca i64, align 8, addrspace(5)
%"38" = alloca i32, align 4, addrspace(5)
%"39" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"33"
"33": ; preds = %1
%"40" = load i64, ptr addrspace(4) %"34", align 4
store i64 %"40", ptr addrspace(5) %"36", align 4
%"41" = load i64, ptr addrspace(4) %"35", align 4
store i64 %"41", ptr addrspace(5) %"37", align 4
%"42" = load i64, ptr addrspace(5) %"36", align 4
%"50" = inttoptr i64 %"42" to ptr addrspace(1)
%"32" = getelementptr inbounds i8, ptr addrspace(1) %"50", i64 8
%"30" = load <2 x i32>, ptr addrspace(1) %"32", align 8
%"43" = extractelement <2 x i32> %"30", i8 0
%"44" = extractelement <2 x i32> %"30", i8 1
store i32 %"43", ptr addrspace(5) %"38", align 4
store i32 %"44", ptr addrspace(5) %"39", align 4
%"46" = load i32, ptr addrspace(5) %"38", align 4
%"47" = load i32, ptr addrspace(5) %"39", align 4
%"45" = add i32 %"46", %"47"
store i32 %"45", ptr addrspace(5) %"38", align 4
%"48" = load i64, ptr addrspace(5) %"37", align 4
%"49" = load i32, ptr addrspace(5) %"38", align 4
%"51" = inttoptr i64 %"48" to ptr addrspace(1)
store i32 %"49", ptr addrspace(1) %"51", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

ptx/src/test/ll/not.ll Normal file (30 lines)

@ -0,0 +1,30 @@
define amdgpu_kernel void @not(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
%"33" = alloca i64, align 8, addrspace(5)
%"34" = alloca i64, align 8, addrspace(5)
%"35" = alloca i64, align 8, addrspace(5)
%"36" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
br label %"30"
"30": ; preds = %1
%"37" = load i64, ptr addrspace(4) %"31", align 4
store i64 %"37", ptr addrspace(5) %"33", align 4
%"38" = load i64, ptr addrspace(4) %"32", align 4
store i64 %"38", ptr addrspace(5) %"34", align 4
%"40" = load i64, ptr addrspace(5) %"33", align 4
%"45" = inttoptr i64 %"40" to ptr
%"39" = load i64, ptr %"45", align 4
store i64 %"39", ptr addrspace(5) %"35", align 4
%"42" = load i64, ptr addrspace(5) %"35", align 4
%"46" = xor i64 %"42", -1
store i64 %"46", ptr addrspace(5) %"36", align 4
%"43" = load i64, ptr addrspace(5) %"34", align 4
%"44" = load i64, ptr addrspace(5) %"36", align 4
%"48" = inttoptr i64 %"43" to ptr
store i64 %"44", ptr %"48", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

ptx/src/test/ll/ntid.ll Normal file (39 lines)

@ -0,0 +1,39 @@
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
define amdgpu_kernel void @ntid(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #1 {
%"37" = alloca i64, align 8, addrspace(5)
%"38" = alloca i64, align 8, addrspace(5)
%"39" = alloca i32, align 4, addrspace(5)
%"40" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"32"
"32": ; preds = %1
%"41" = load i64, ptr addrspace(4) %"35", align 4
store i64 %"41", ptr addrspace(5) %"37", align 4
%"42" = load i64, ptr addrspace(4) %"36", align 4
store i64 %"42", ptr addrspace(5) %"38", align 4
%"44" = load i64, ptr addrspace(5) %"37", align 4
%"51" = inttoptr i64 %"44" to ptr
%"43" = load i32, ptr %"51", align 4
store i32 %"43", ptr addrspace(5) %"39", align 4
%"31" = call i32 @__zluda_ptx_impl_sreg_ntid(i8 0)
br label %"33"
"33": ; preds = %"32"
store i32 %"31", ptr addrspace(5) %"40", align 4
%"47" = load i32, ptr addrspace(5) %"39", align 4
%"48" = load i32, ptr addrspace(5) %"40", align 4
%"46" = add i32 %"47", %"48"
store i32 %"46", ptr addrspace(5) %"39", align 4
%"49" = load i64, ptr addrspace(5) %"38", align 4
%"50" = load i32, ptr addrspace(5) %"39", align 4
%"52" = inttoptr i64 %"49" to ptr
store i32 %"50", ptr %"52", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

ptx/src/test/ll/or.ll Normal file (36 lines)

@ -0,0 +1,36 @@
define amdgpu_kernel void @or(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 {
%"35" = alloca i64, align 8, addrspace(5)
%"36" = alloca i64, align 8, addrspace(5)
%"37" = alloca i64, align 8, addrspace(5)
%"38" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
br label %"32"
"32": ; preds = %1
%"39" = load i64, ptr addrspace(4) %"33", align 4
store i64 %"39", ptr addrspace(5) %"35", align 4
%"40" = load i64, ptr addrspace(4) %"34", align 4
store i64 %"40", ptr addrspace(5) %"36", align 4
%"42" = load i64, ptr addrspace(5) %"35", align 4
%"50" = inttoptr i64 %"42" to ptr
%"41" = load i64, ptr %"50", align 4
store i64 %"41", ptr addrspace(5) %"37", align 4
%"43" = load i64, ptr addrspace(5) %"35", align 4
%"51" = inttoptr i64 %"43" to ptr
%"31" = getelementptr inbounds i8, ptr %"51", i64 8
%"44" = load i64, ptr %"31", align 4
store i64 %"44", ptr addrspace(5) %"38", align 4
%"46" = load i64, ptr addrspace(5) %"37", align 4
%"47" = load i64, ptr addrspace(5) %"38", align 4
%"52" = or i64 %"46", %"47"
store i64 %"52", ptr addrspace(5) %"37", align 4
%"48" = load i64, ptr addrspace(5) %"36", align 4
%"49" = load i64, ptr addrspace(5) %"37", align 4
%"55" = inttoptr i64 %"48" to ptr
store i64 %"49", ptr %"55", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

ptx/src/test/ll/popc.ll Normal file (33 lines)

@ -0,0 +1,33 @@
define amdgpu_kernel void @popc(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
%"32" = alloca i64, align 8, addrspace(5)
%"33" = alloca i64, align 8, addrspace(5)
%"34" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"29"
"29": ; preds = %1
%"35" = load i64, ptr addrspace(4) %"30", align 4
store i64 %"35", ptr addrspace(5) %"32", align 4
%"36" = load i64, ptr addrspace(4) %"31", align 4
store i64 %"36", ptr addrspace(5) %"33", align 4
%"38" = load i64, ptr addrspace(5) %"32", align 4
%"43" = inttoptr i64 %"38" to ptr
%"37" = load i32, ptr %"43", align 4
store i32 %"37", ptr addrspace(5) %"34", align 4
%"40" = load i32, ptr addrspace(5) %"34", align 4
%"44" = call i32 @llvm.ctpop.i32(i32 %"40")
store i32 %"44", ptr addrspace(5) %"34", align 4
%"41" = load i64, ptr addrspace(5) %"33", align 4
%"42" = load i32, ptr addrspace(5) %"34", align 4
%"45" = inttoptr i64 %"41" to ptr
store i32 %"42", ptr %"45", align 4
ret void
}
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.ctpop.i32(i32) #1
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }


@ -0,0 +1,57 @@
define amdgpu_kernel void @pred_not(ptr addrspace(4) byref(i64) %"41", ptr addrspace(4) byref(i64) %"42") #0 {
%"43" = alloca i64, align 8, addrspace(5)
%"44" = alloca i64, align 8, addrspace(5)
%"45" = alloca i64, align 8, addrspace(5)
%"46" = alloca i64, align 8, addrspace(5)
%"47" = alloca i64, align 8, addrspace(5)
%"48" = alloca i1, align 1, addrspace(5)
br label %1
1: ; preds = %0
br label %"40"
"40": ; preds = %1
%"49" = load i64, ptr addrspace(4) %"41", align 4
store i64 %"49", ptr addrspace(5) %"43", align 4
%"50" = load i64, ptr addrspace(4) %"42", align 4
store i64 %"50", ptr addrspace(5) %"44", align 4
%"52" = load i64, ptr addrspace(5) %"43", align 4
%"66" = inttoptr i64 %"52" to ptr
%"51" = load i64, ptr %"66", align 4
store i64 %"51", ptr addrspace(5) %"45", align 4
%"53" = load i64, ptr addrspace(5) %"43", align 4
%"67" = inttoptr i64 %"53" to ptr
%"37" = getelementptr inbounds i8, ptr %"67", i64 8
%"54" = load i64, ptr %"37", align 4
store i64 %"54", ptr addrspace(5) %"46", align 4
%"56" = load i64, ptr addrspace(5) %"45", align 4
%"57" = load i64, ptr addrspace(5) %"46", align 4
%"55" = icmp ult i64 %"56", %"57"
store i1 %"55", ptr addrspace(5) %"48", align 1
%"59" = load i1, ptr addrspace(5) %"48", align 1
%"58" = xor i1 %"59", true
store i1 %"58", ptr addrspace(5) %"48", align 1
%"60" = load i1, ptr addrspace(5) %"48", align 1
br i1 %"60", label %"16", label %"17"
"16": ; preds = %"40"
store i64 1, ptr addrspace(5) %"47", align 4
br label %"17"
"17": ; preds = %"16", %"40"
%"62" = load i1, ptr addrspace(5) %"48", align 1
br i1 %"62", label %"19", label %"18"
"18": ; preds = %"17"
store i64 2, ptr addrspace(5) %"47", align 4
br label %"19"
"19": ; preds = %"18", %"17"
%"64" = load i64, ptr addrspace(5) %"44", align 4
%"65" = load i64, ptr addrspace(5) %"47", align 4
%"68" = inttoptr i64 %"64" to ptr
store i64 %"65", ptr %"68", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

ptx/src/test/ll/prmt.ll Normal file (38 lines)

@ -0,0 +1,38 @@
define amdgpu_kernel void @prmt(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 {
%"35" = alloca i64, align 8, addrspace(5)
%"36" = alloca i64, align 8, addrspace(5)
%"37" = alloca i32, align 4, addrspace(5)
%"38" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"32"
"32": ; preds = %1
%"39" = load i64, ptr addrspace(4) %"33", align 4
store i64 %"39", ptr addrspace(5) %"35", align 4
%"40" = load i64, ptr addrspace(4) %"34", align 4
store i64 %"40", ptr addrspace(5) %"36", align 4
%"42" = load i64, ptr addrspace(5) %"35", align 4
%"50" = inttoptr i64 %"42" to ptr
%"41" = load i32, ptr %"50", align 4
store i32 %"41", ptr addrspace(5) %"37", align 4
%"43" = load i64, ptr addrspace(5) %"35", align 4
%"51" = inttoptr i64 %"43" to ptr
%"31" = getelementptr inbounds i8, ptr %"51", i64 4
%"44" = load i32, ptr %"31", align 4
store i32 %"44", ptr addrspace(5) %"38", align 4
%"46" = load i32, ptr addrspace(5) %"37", align 4
%"47" = load i32, ptr addrspace(5) %"38", align 4
%2 = bitcast i32 %"46" to <4 x i8>
%3 = bitcast i32 %"47" to <4 x i8>
%"52" = shufflevector <4 x i8> %2, <4 x i8> %3, <4 x i32> <i32 4, i32 0, i32 6, i32 7>
store <4 x i8> %"52", ptr addrspace(5) %"38", align 4
%"48" = load i64, ptr addrspace(5) %"36", align 4
%"49" = load i32, ptr addrspace(5) %"38", align 4
%"55" = inttoptr i64 %"48" to ptr
store i32 %"49", ptr %"55", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

ptx/src/test/ll/rcp.ll Normal file (33 lines)

@ -0,0 +1,33 @@
define amdgpu_kernel void @rcp(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
%"32" = alloca i64, align 8, addrspace(5)
%"33" = alloca i64, align 8, addrspace(5)
%"34" = alloca float, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"29"
"29": ; preds = %1
%"35" = load i64, ptr addrspace(4) %"30", align 4
store i64 %"35", ptr addrspace(5) %"32", align 4
%"36" = load i64, ptr addrspace(4) %"31", align 4
store i64 %"36", ptr addrspace(5) %"33", align 4
%"38" = load i64, ptr addrspace(5) %"32", align 4
%"43" = inttoptr i64 %"38" to ptr
%"37" = load float, ptr %"43", align 4
store float %"37", ptr addrspace(5) %"34", align 4
%"40" = load float, ptr addrspace(5) %"34", align 4
%"39" = call float @llvm.amdgcn.rcp.f32(float %"40")
store float %"39", ptr addrspace(5) %"34", align 4
%"41" = load i64, ptr addrspace(5) %"33", align 4
%"42" = load float, ptr addrspace(5) %"34", align 4
%"44" = inttoptr i64 %"41" to ptr
store float %"42", ptr %"44", align 4
ret void
}
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare float @llvm.amdgcn.rcp.f32(float) #1
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }


@ -0,0 +1,36 @@
define amdgpu_kernel void @reg_local(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 {
%"10" = alloca [8 x i8], align 8, addrspace(5)
%"39" = alloca i64, align 8, addrspace(5)
%"40" = alloca i64, align 8, addrspace(5)
%"41" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
br label %"36"
"36": ; preds = %1
%"42" = load i64, ptr addrspace(4) %"37", align 4
store i64 %"42", ptr addrspace(5) %"39", align 4
%"43" = load i64, ptr addrspace(4) %"38", align 4
store i64 %"43", ptr addrspace(5) %"40", align 4
%"45" = load i64, ptr addrspace(5) %"39", align 4
%"51" = inttoptr i64 %"45" to ptr addrspace(1)
%"50" = load i64, ptr addrspace(1) %"51", align 4
store i64 %"50", ptr addrspace(5) %"41", align 4
%"46" = load i64, ptr addrspace(5) %"41", align 4
%"31" = add i64 %"46", 1
%"52" = addrspacecast ptr addrspace(5) %"10" to ptr
store i64 %"31", ptr %"52", align 4
%"54" = addrspacecast ptr addrspace(5) %"10" to ptr
%"33" = getelementptr inbounds i8, ptr %"54", i64 0
%"55" = load i64, ptr %"33", align 4
store i64 %"55", ptr addrspace(5) %"41", align 4
%"48" = load i64, ptr addrspace(5) %"40", align 4
%"56" = inttoptr i64 %"48" to ptr addrspace(1)
%"35" = getelementptr inbounds i8, ptr addrspace(1) %"56", i64 0
%"49" = load i64, ptr addrspace(5) %"41", align 4
store i64 %"49", ptr addrspace(1) %"35", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

Some files were not shown because too many files have changed in this diff.