mirror of
https://github.com/vosen/ZLUDA.git
synced 2025-04-23 18:08:57 +03:00
Compare commits
No commits in common. "master" and "v4" have entirely different histories.
@ -1,2 +0,0 @@
|
|||||||
[alias]
|
|
||||||
xtask = "run --package xtask --"
|
|
@ -32,7 +32,7 @@ RUN DEBIAN_FRONTEND=noninteractive apt-get update -y && DEBIAN_FRONTEND=noninter
|
|||||||
cuda-profiler-api-${CUDA_PKG_VERSION} \
|
cuda-profiler-api-${CUDA_PKG_VERSION} \
|
||||||
cuda-nvcc-${CUDA_PKG_VERSION}
|
cuda-nvcc-${CUDA_PKG_VERSION}
|
||||||
|
|
||||||
ARG ROCM_VERSION=6.3.1
|
ARG ROCM_VERSION=6.2.2
|
||||||
RUN mkdir --parents --mode=0755 /etc/apt/keyrings && \
|
RUN mkdir --parents --mode=0755 /etc/apt/keyrings && \
|
||||||
wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | \
|
wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | \
|
||||||
gpg --dearmor | tee /etc/apt/keyrings/rocm.gpg > /dev/null && \
|
gpg --dearmor | tee /etc/apt/keyrings/rocm.gpg > /dev/null && \
|
||||||
@ -43,7 +43,7 @@ RUN mkdir --parents --mode=0755 /etc/apt/keyrings && \
|
|||||||
rocm-gdb \
|
rocm-gdb \
|
||||||
rocm-smi-lib \
|
rocm-smi-lib \
|
||||||
rocm-llvm-dev \
|
rocm-llvm-dev \
|
||||||
hip-runtime-amd \
|
hip-runtime-amd && \
|
||||||
hip-dev && \
|
hip-dev && \
|
||||||
echo '/opt/rocm/lib' > /etc/ld.so.conf.d/rocm.conf && \
|
echo '/opt/rocm/lib' > /etc/ld.so.conf.d/rocm.conf && \
|
||||||
ldconfig
|
ldconfig
|
||||||
|
61
CONTRIBUTING.md
Normal file
61
CONTRIBUTING.md
Normal file
@ -0,0 +1,61 @@
|
|||||||
|
# Dependencies
|
||||||
|
|
||||||
|
Development builds of ZLUDA require the following dependencies:
|
||||||
|
|
||||||
|
* CMake
|
||||||
|
* Python 3
|
||||||
|
|
||||||
|
Additionally, the repository has to be cloned with Git submodules initialized. If you cloned the repo without initializing submodules, do this:
|
||||||
|
```
|
||||||
|
git submodule update --init --recursive
|
||||||
|
```
|
||||||
|
|
||||||
|
# Tests
|
||||||
|
|
||||||
|
Tests should be executed with `--workspace` option to test non-default targets:
|
||||||
|
```
|
||||||
|
cargo test --workspace
|
||||||
|
```
|
||||||
|
|
||||||
|
# Debugging
|
||||||
|
|
||||||
|
## Debugging CUDA applications
|
||||||
|
|
||||||
|
When running an application with ZLUDA quite often you will run into subtle bugs or incompatibilities in the generated GPU code. The best way to debug an application's GPU CUDA code is to use ZLUDA dumper.
|
||||||
|
|
||||||
|
Library `zluda_dump` can be injected into a CUDA application and produce a trace which, for every launched GPU function, contains:
|
||||||
|
* PTX source
|
||||||
|
* Launch arguments (block size, grid size, shared memory size)
|
||||||
|
* Dump of function arguments, both before and after the launch
|
||||||
|
|
||||||
|
Example use with GeekBench:
|
||||||
|
```
|
||||||
|
set ZLUDA_DUMP_KERNEL=knn_match
|
||||||
|
set ZLUDA_DUMP_DIR=C:\temp\zluda_dump
|
||||||
|
"<ZLUDA_PATH>\zluda_with.exe" "<ZLUDA_PATH>\zluda_dump.dll" -- "geekbench_x86_64.exe" --compute CUDA
|
||||||
|
```
|
||||||
|
|
||||||
|
The example above, for every execution of GPU function `knn_match`, will save its details into the directory `C:\temp\zluda_dump`
|
||||||
|
|
||||||
|
This dump can be replayed with the `replay.py` script from the `zluda_dump` source directory. Use it like this:
|
||||||
|
```
|
||||||
|
python replay.py "C:\temp\zluda_dump\geekbench_x86_64.exe"
|
||||||
|
```
|
||||||
|
You must copy (or symlink) ZLUDA `nvcuda.dll` into PyCUDA directory, so it will run using ZLUDA. Example output:
|
||||||
|
```
|
||||||
|
Intel(R) Graphics [0x3e92] [github.com/vosen/ZLUDA]
|
||||||
|
C:\temp\zluda_dump\geekbench_x86_64.exe\4140_scale_pyramid
|
||||||
|
C:\temp\zluda_dump\geekbench_x86_64.exe\4345_convolve_1d_vertical_grayscale
|
||||||
|
Skipping, launch block size (512) bigger than maximum block size (256)
|
||||||
|
C:\temp\zluda_dump\geekbench_x86_64.exe\4480_scale_pyramid
|
||||||
|
6:
|
||||||
|
Arrays are not equal
|
||||||
|
|
||||||
|
Mismatched elements: 1200 / 19989588 (0.006%)
|
||||||
|
Max absolute difference: 255
|
||||||
|
Max relative difference: 255.
|
||||||
|
x: array([ 7, 6, 8, ..., 193, 195, 193], dtype=uint8)
|
||||||
|
y: array([ 7, 6, 8, ..., 193, 195, 193], dtype=uint8)
|
||||||
|
```
|
||||||
|
From this output one can observe that in kernel launch 4480, 6th argument to function `scale_pyramid` differs between what was executed on an NVIDIA GPU using CUDA and Intel GPU using ZLUDA.
|
||||||
|
__Important__: It's impossible to infer the type (and semantics) of an argument passed to a GPU function. At our level it's a buffer of bytes, and by default `replay.py` simply checks whether two buffers are byte-equal. That means you will get a ton of false negatives when running `replay.py`. You should override these checks for your particular case in `replay.py` - it already contains some overrides for GeekBench kernels.
|
1395
Cargo.lock
generated
1395
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
@ -18,16 +18,7 @@ members = [
|
|||||||
"ptx_parser",
|
"ptx_parser",
|
||||||
"ptx_parser_macros",
|
"ptx_parser_macros",
|
||||||
"ptx_parser_macros_impl",
|
"ptx_parser_macros_impl",
|
||||||
"xtask",
|
|
||||||
"zluda_bindgen",
|
"zluda_bindgen",
|
||||||
]
|
]
|
||||||
|
|
||||||
default-members = ["zluda", "zluda_ml", "zluda_inject", "zluda_redirect"]
|
default-members = ["zluda", "zluda_ml", "zluda_inject", "zluda_redirect"]
|
||||||
|
|
||||||
[profile.release-lto]
|
|
||||||
inherits = "release"
|
|
||||||
codegen-units = 1
|
|
||||||
lto = true
|
|
||||||
|
|
||||||
[profile.dev.package.xtask]
|
|
||||||
opt-level = 2
|
|
||||||
|
1
GeekBench_5_2_3.svg
Normal file
1
GeekBench_5_2_3.svg
Normal file
File diff suppressed because one or more lines are too long
After Width: | Height: | Size: 259 KiB |
60
README.md
60
README.md
@ -4,23 +4,18 @@
|
|||||||
|
|
||||||
ZLUDA is a drop-in replacement for CUDA on non-NVIDIA GPU. ZLUDA allows to run unmodified CUDA applications using non-NVIDIA GPUs with near-native performance.
|
ZLUDA is a drop-in replacement for CUDA on non-NVIDIA GPU. ZLUDA allows to run unmodified CUDA applications using non-NVIDIA GPUs with near-native performance.
|
||||||
|
|
||||||
ZLUDA supports AMD Radeon RX 5000 series and newer GPUs (both desktop and integrated).
|
|
||||||
|
|
||||||

|
|
||||||
|
|
||||||
ZLUDA is work in progress. Follow development here and say hi on [Discord](https://discord.gg/sg6BNzXuc7). For more details see the announcement: https://vosen.github.io/ZLUDA/blog/zludas-third-life/
|
ZLUDA is work in progress. Follow development here and say hi on [Discord](https://discord.gg/sg6BNzXuc7). For more details see the announcement: https://vosen.github.io/ZLUDA/blog/zludas-third-life/
|
||||||
|
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
**Warning**: This version of ZLUDA is under heavy development (more [here](https://vosen.github.io/ZLUDA/blog/zludas-third-life/)) and right now only supports Geekbench. ZLUDA probably will not work with your application just yet.
|
**Warning**: ZLUDA is under heavy development (see news [here](https://vosen.github.io/ZLUDA/blog/zludas-third-life/)). Instructions below might not work.
|
||||||
|
|
||||||
### Windows
|
### Windows
|
||||||
You should have a recent AMD GPU driver ("AMD Software: Adrenalin Edition") installed.\
|
You should have the most recent ROCm installed.\
|
||||||
To run your application you should either:
|
Run your application like this:
|
||||||
* (Recommended approach) Copy ZLUDA-provided `nvcuda.dll` and `nvml.dll` from `target\release` (if built from sources) or `zluda` (if downloaded a zip package) into a path which your application uses to load CUDA. Paths vary application to application, but usually it's the directory where the .exe file is located
|
```
|
||||||
* Use ZLUDA launcher like below. ZLUDA launcher is known to be buggy and incomplete:
|
<ZLUDA_DIRECTORY>\zluda_with.exe -- <APPLICATION> <APPLICATIONS_ARGUMENTS>
|
||||||
```
|
```
|
||||||
<ZLUDA_DIRECTORY>\zluda_with.exe -- <APPLICATION> <APPLICATIONS_ARGUMENTS>
|
|
||||||
```
|
|
||||||
|
|
||||||
### Linux
|
### Linux
|
||||||
|
|
||||||
@ -29,44 +24,33 @@ Run your application like this:
|
|||||||
LD_LIBRARY_PATH=<ZLUDA_DIRECTORY> <APPLICATION> <APPLICATIONS_ARGUMENTS>
|
LD_LIBRARY_PATH=<ZLUDA_DIRECTORY> <APPLICATION> <APPLICATIONS_ARGUMENTS>
|
||||||
```
|
```
|
||||||
|
|
||||||
where `<ZLUDA_DIRECTORY>` is the directory which contains ZLUDA-provided `libcuda.so`: `target/release` if you built from sources or `zluda` if you downloaded prebuilt package.
|
|
||||||
|
|
||||||
### MacOS
|
### MacOS
|
||||||
|
|
||||||
Not supported
|
Not supported
|
||||||
|
|
||||||
## Building
|
## Building
|
||||||
|
**Warning**: ZLUDA is under heavy development (see news [here](https://vosen.github.io/ZLUDA/blog/zludas-third-life/)). Instructions below might not work.
|
||||||
|
|
||||||
### Dependencies
|
_Note_: This repo has submodules. Make sure to recurse submodules when cloning this repo, e.g.: `git clone --recursive https://github.com/vosen/ZLUDA.git`
|
||||||
|
|
||||||
* Git
|
You should have a relatively recent version of Rust installed, then you just do:
|
||||||
* CMake
|
|
||||||
* Python 3
|
|
||||||
* Rust compiler (recent version)
|
|
||||||
* C++ compiler
|
|
||||||
* (Optional, but recommended) [Ninja build system](https://ninja-build.org/)
|
|
||||||
|
|
||||||
### Build steps
|
```
|
||||||
|
cargo build --release
|
||||||
|
```
|
||||||
|
in the main directory of the project.
|
||||||
|
### Linux
|
||||||
|
|
||||||
* Git clone the repo (make sure to use `--recursive` option to fetch submodules):
|
If you are building on Linux you must also symlink (or rename) the ZLUDA output binaries after ZLUDA build finishes:
|
||||||
`git clone --recursive https://github.com/vosen/ZLUDA.git`
|
```
|
||||||
* Enter freshly cloned `ZLUDA` directory and build with cargo (this takes a while):
|
ln -s libnvcuda.so target/release/libcuda.so
|
||||||
`cargo xtask --release`
|
ln -s libnvcuda.so target/release/libcuda.so.1
|
||||||
|
ln -s libnvml.so target/release/libnvidia-ml.so
|
||||||
|
```
|
||||||
|
|
||||||
## Contributing
|
## Contributing
|
||||||
|
|
||||||
ZLUDA project has a commercial backing and _does not_ accept donations.
|
If you want to develop ZLUDA itself, read [CONTRIBUTING.md](CONTRIBUTING.md); it contains instructions on how to set up dependencies and run tests.
|
||||||
ZLUDA project accepts pull requests and other non-monetary contributions.
|
|
||||||
|
|
||||||
If you want to contribute a code fix or documentation update feel free to open a Pull Request.
|
|
||||||
|
|
||||||
### Getting started
|
|
||||||
|
|
||||||
There's no architecture document (yet). The two most important crates in ZLUDA are `ptx` (PTX compiler) and `zluda` (AMD GPU runtime). A good starting point for tinkering with the project is to run one of the `ptx` unit tests under a debugger and understand what it is doing. `cargo test -p ptx -- ::add_hip` is a simple test that adds two numbers.
|
|
||||||
|
|
||||||
GitHub issues tagged with ["help wanted"](https://github.com/vosen/ZLUDA/issues?q=is%3Aissue+is%3Aopen+label%3A%22help+wanted%22) are tasks that are self-contained. Their level of difficulty varies and they are not always good beginner tasks, but they are defined unambiguously.
|
|
||||||
|
|
||||||
If you have questions feel free to ask on [#devtalk channel on Discord](https://discord.com/channels/1273316903783497778/1303329281409159270).
|
|
||||||
|
|
||||||
|
|
||||||
## License
|
## License
|
||||||
|
@ -133,26 +133,21 @@ pub fn compile_bitcode(
|
|||||||
&linking_info,
|
&linking_info,
|
||||||
amd_comgr_action_kind_t::AMD_COMGR_ACTION_LINK_BC_TO_BC,
|
amd_comgr_action_kind_t::AMD_COMGR_ACTION_LINK_BC_TO_BC,
|
||||||
)?;
|
)?;
|
||||||
let compile_to_exec = ActionInfo::new()?;
|
let link_with_device_libs_info = ActionInfo::new()?;
|
||||||
compile_to_exec.set_isa_name(gcn_arch)?;
|
link_with_device_libs_info.set_isa_name(gcn_arch)?;
|
||||||
compile_to_exec.set_language(amd_comgr_language_t::AMD_COMGR_LANGUAGE_LLVM_IR)?;
|
link_with_device_libs_info.set_language(amd_comgr_language_t::AMD_COMGR_LANGUAGE_LLVM_IR)?;
|
||||||
let common_options = [
|
|
||||||
// This makes no sense, but it makes ockl linking work
|
// This makes no sense, but it makes ockl linking work
|
||||||
c"-Xclang",
|
link_with_device_libs_info
|
||||||
c"-mno-link-builtin-bitcode-postopt",
|
.set_options([c"-Xclang", c"-mno-link-builtin-bitcode-postopt"].into_iter())?;
|
||||||
// Otherwise LLVM omits dynamic fp mode for ockl functions during linking
|
let with_device_libs = do_action(
|
||||||
// and then fails to inline them
|
&linked_data_set,
|
||||||
c"-Xclang",
|
&link_with_device_libs_info,
|
||||||
c"-fdenormal-fp-math=dynamic",
|
amd_comgr_action_kind_t::AMD_COMGR_ACTION_COMPILE_SOURCE_WITH_DEVICE_LIBS_TO_BC,
|
||||||
c"-O3",
|
)?;
|
||||||
c"-mno-wavefrontsize64",
|
let compile_action_info = ActionInfo::new()?;
|
||||||
c"-mcumode",
|
compile_action_info.set_isa_name(gcn_arch)?;
|
||||||
// Useful for inlining reports, combined with AMD_COMGR_SAVE_TEMPS=1 AMD_COMGR_EMIT_VERBOSE_LOGS=1 AMD_COMGR_REDIRECT_LOGS=stderr
|
let common_options = [c"-O3", c"-mno-wavefrontsize64", c"-mcumode"].into_iter();
|
||||||
// c"-fsave-optimization-record=yaml",
|
|
||||||
]
|
|
||||||
.into_iter();
|
|
||||||
let opt_options = if cfg!(debug_assertions) {
|
let opt_options = if cfg!(debug_assertions) {
|
||||||
//[c"-g", c"-mllvm", c"-print-before-all", c"", c""]
|
|
||||||
[c"-g", c"", c"", c"", c""]
|
[c"-g", c"", c"", c"", c""]
|
||||||
} else {
|
} else {
|
||||||
[
|
[
|
||||||
@ -164,11 +159,16 @@ pub fn compile_bitcode(
|
|||||||
c"-inlinehint-threshold=3250",
|
c"-inlinehint-threshold=3250",
|
||||||
]
|
]
|
||||||
};
|
};
|
||||||
compile_to_exec.set_options(common_options.chain(opt_options))?;
|
compile_action_info.set_options(common_options.chain(opt_options))?;
|
||||||
|
let reloc_data_set = do_action(
|
||||||
|
&with_device_libs,
|
||||||
|
&compile_action_info,
|
||||||
|
amd_comgr_action_kind_t::AMD_COMGR_ACTION_CODEGEN_BC_TO_RELOCATABLE,
|
||||||
|
)?;
|
||||||
let exec_data_set = do_action(
|
let exec_data_set = do_action(
|
||||||
&linked_data_set,
|
&reloc_data_set,
|
||||||
&compile_to_exec,
|
&compile_action_info,
|
||||||
amd_comgr_action_kind_t::AMD_COMGR_ACTION_COMPILE_SOURCE_TO_EXECUTABLE,
|
amd_comgr_action_kind_t::AMD_COMGR_ACTION_LINK_RELOCATABLE_TO_EXECUTABLE,
|
||||||
)?;
|
)?;
|
||||||
let executable =
|
let executable =
|
||||||
exec_data_set.get_data(amd_comgr_data_kind_t::AMD_COMGR_DATA_KIND_EXECUTABLE, 0)?;
|
exec_data_set.get_data(amd_comgr_data_kind_t::AMD_COMGR_DATA_KIND_EXECUTABLE, 0)?;
|
||||||
|
2
ext/amd_comgr-sys/build.rs
vendored
2
ext/amd_comgr-sys/build.rs
vendored
@ -13,7 +13,7 @@ fn main() -> Result<(), VarError> {
|
|||||||
println!("cargo:rustc-link-search=native=C:\\Windows\\System32");
|
println!("cargo:rustc-link-search=native=C:\\Windows\\System32");
|
||||||
};
|
};
|
||||||
} else {
|
} else {
|
||||||
println!("cargo:rustc-link-lib=dylib:+verbatim=libamd_comgr.so.2");
|
println!("cargo:rustc-link-lib=dylib=amd_comgr");
|
||||||
println!("cargo:rustc-link-search=native=/opt/rocm/lib/");
|
println!("cargo:rustc-link-search=native=/opt/rocm/lib/");
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
|
2
ext/hip_runtime-sys/build.rs
vendored
2
ext/hip_runtime-sys/build.rs
vendored
@ -13,7 +13,7 @@ fn main() -> Result<(), VarError> {
|
|||||||
println!("cargo:rustc-link-search=native=C:\\Windows\\System32");
|
println!("cargo:rustc-link-search=native=C:\\Windows\\System32");
|
||||||
};
|
};
|
||||||
} else {
|
} else {
|
||||||
println!("cargo:rustc-link-lib=dylib:+verbatim=libamdhip64.so.6");
|
println!("cargo:rustc-link-lib=dylib=amdhip64");
|
||||||
println!("cargo:rustc-link-search=native=/opt/rocm/lib/");
|
println!("cargo:rustc-link-search=native=/opt/rocm/lib/");
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
|
File diff suppressed because one or more lines are too long
Before Width: | Height: | Size: 287 KiB |
@ -17,10 +17,6 @@ bitflags = "1.2"
|
|||||||
rustc-hash = "2.0.0"
|
rustc-hash = "2.0.0"
|
||||||
strum = "0.26"
|
strum = "0.26"
|
||||||
strum_macros = "0.26"
|
strum_macros = "0.26"
|
||||||
petgraph = "0.7.1"
|
|
||||||
microlp = "0.2.10"
|
|
||||||
int-enum = "1.1"
|
|
||||||
unwrap_or = "1.0.1"
|
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
hip_runtime-sys = { path = "../ext/hip_runtime-sys" }
|
hip_runtime-sys = { path = "../ext/hip_runtime-sys" }
|
||||||
@ -28,4 +24,3 @@ comgr = { path = "../comgr" }
|
|||||||
tempfile = "3"
|
tempfile = "3"
|
||||||
paste = "1.0"
|
paste = "1.0"
|
||||||
cuda-driver-sys = "0.3.0"
|
cuda-driver-sys = "0.3.0"
|
||||||
pretty_assertions = "1.4.1"
|
|
@ -2,8 +2,8 @@ use super::*;
|
|||||||
|
|
||||||
pub(super) fn run<'a, 'input>(
|
pub(super) fn run<'a, 'input>(
|
||||||
resolver: &mut GlobalStringIdentResolver2<'input>,
|
resolver: &mut GlobalStringIdentResolver2<'input>,
|
||||||
directives: Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>,
|
directives: Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>,
|
||||||
) -> Result<Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
|
) -> Result<Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
|
||||||
directives
|
directives
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|directive| run_directive(resolver, directive))
|
.map(|directive| run_directive(resolver, directive))
|
||||||
@ -12,8 +12,8 @@ pub(super) fn run<'a, 'input>(
|
|||||||
|
|
||||||
fn run_directive<'input>(
|
fn run_directive<'input>(
|
||||||
resolver: &mut GlobalStringIdentResolver2,
|
resolver: &mut GlobalStringIdentResolver2,
|
||||||
directive: Directive2<ast::Instruction<SpirvWord>, SpirvWord>,
|
directive: Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>,
|
||||||
) -> Result<Directive2<ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
|
) -> Result<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
|
||||||
Ok(match directive {
|
Ok(match directive {
|
||||||
var @ Directive2::Variable(..) => var,
|
var @ Directive2::Variable(..) => var,
|
||||||
Directive2::Method(method) => Directive2::Method(run_method(resolver, method)?),
|
Directive2::Method(method) => Directive2::Method(run_method(resolver, method)?),
|
||||||
@ -22,13 +22,13 @@ fn run_directive<'input>(
|
|||||||
|
|
||||||
fn run_method<'input>(
|
fn run_method<'input>(
|
||||||
resolver: &mut GlobalStringIdentResolver2,
|
resolver: &mut GlobalStringIdentResolver2,
|
||||||
mut method: Function2<ast::Instruction<SpirvWord>, SpirvWord>,
|
mut method: Function2<'input, ast::Instruction<SpirvWord>, SpirvWord>,
|
||||||
) -> Result<Function2<ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
|
) -> Result<Function2<'input, ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
|
||||||
let is_declaration = method.body.is_none();
|
let is_declaration = method.body.is_none();
|
||||||
let mut body = Vec::new();
|
let mut body = Vec::new();
|
||||||
let mut remap_returns = Vec::new();
|
let mut remap_returns = Vec::new();
|
||||||
if !method.is_kernel {
|
if !method.func_decl.name.is_kernel() {
|
||||||
for arg in method.return_arguments.iter_mut() {
|
for arg in method.func_decl.return_arguments.iter_mut() {
|
||||||
match arg.state_space {
|
match arg.state_space {
|
||||||
ptx_parser::StateSpace::Param => {
|
ptx_parser::StateSpace::Param => {
|
||||||
arg.state_space = ptx_parser::StateSpace::Reg;
|
arg.state_space = ptx_parser::StateSpace::Reg;
|
||||||
@ -51,7 +51,7 @@ fn run_method<'input>(
|
|||||||
_ => return Err(error_unreachable()),
|
_ => return Err(error_unreachable()),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for arg in method.input_arguments.iter_mut() {
|
for arg in method.func_decl.input_arguments.iter_mut() {
|
||||||
match arg.state_space {
|
match arg.state_space {
|
||||||
ptx_parser::StateSpace::Param => {
|
ptx_parser::StateSpace::Param => {
|
||||||
arg.state_space = ptx_parser::StateSpace::Reg;
|
arg.state_space = ptx_parser::StateSpace::Reg;
|
||||||
@ -95,7 +95,14 @@ fn run_method<'input>(
|
|||||||
Ok::<_, TranslateError>(body)
|
Ok::<_, TranslateError>(body)
|
||||||
})
|
})
|
||||||
.transpose()?;
|
.transpose()?;
|
||||||
Ok(Function2 { body, ..method })
|
Ok(Function2 {
|
||||||
|
func_decl: method.func_decl,
|
||||||
|
globals: method.globals,
|
||||||
|
body,
|
||||||
|
import_as: method.import_as,
|
||||||
|
tuning: method.tuning,
|
||||||
|
linkage: method.linkage,
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
fn run_statement<'input>(
|
fn run_statement<'input>(
|
||||||
|
@ -36,7 +36,6 @@ use llvm_zluda::bit_writer::LLVMWriteBitcodeToMemoryBuffer;
|
|||||||
use llvm_zluda::{core::*, *};
|
use llvm_zluda::{core::*, *};
|
||||||
use llvm_zluda::{prelude::*, LLVMZludaBuildAtomicRMW};
|
use llvm_zluda::{prelude::*, LLVMZludaBuildAtomicRMW};
|
||||||
use llvm_zluda::{LLVMCallConv, LLVMZludaBuildAlloca};
|
use llvm_zluda::{LLVMCallConv, LLVMZludaBuildAlloca};
|
||||||
use ptx_parser::Mul24Control;
|
|
||||||
|
|
||||||
const LLVM_UNNAMED: &CStr = c"";
|
const LLVM_UNNAMED: &CStr = c"";
|
||||||
// https://llvm.org/docs/AMDGPUUsage.html#address-spaces
|
// https://llvm.org/docs/AMDGPUUsage.html#address-spaces
|
||||||
@ -66,24 +65,17 @@ impl Drop for Context {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct Module(LLVMModuleRef, Context);
|
struct Module(LLVMModuleRef);
|
||||||
|
|
||||||
impl Module {
|
impl Module {
|
||||||
fn new(ctx: Context, name: &CStr) -> Self {
|
fn new(ctx: &Context, name: &CStr) -> Self {
|
||||||
Self(
|
Self(unsafe { LLVMModuleCreateWithNameInContext(name.as_ptr(), ctx.get()) })
|
||||||
unsafe { LLVMModuleCreateWithNameInContext(name.as_ptr(), ctx.get()) },
|
|
||||||
ctx,
|
|
||||||
)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get(&self) -> LLVMModuleRef {
|
fn get(&self) -> LLVMModuleRef {
|
||||||
self.0
|
self.0
|
||||||
}
|
}
|
||||||
|
|
||||||
fn context(&self) -> &Context {
|
|
||||||
&self.1
|
|
||||||
}
|
|
||||||
|
|
||||||
fn verify(&self) -> Result<(), Message> {
|
fn verify(&self) -> Result<(), Message> {
|
||||||
let mut err = ptr::null_mut();
|
let mut err = ptr::null_mut();
|
||||||
let error = unsafe {
|
let error = unsafe {
|
||||||
@ -100,15 +92,10 @@ impl Module {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn write_bitcode_to_memory(&self) -> MemoryBuffer {
|
fn write_bitcode_to_memory(&self) -> MemoryBuffer {
|
||||||
let memory_buffer = unsafe { LLVMWriteBitcodeToMemoryBuffer(self.get()) };
|
let memory_buffer = unsafe { LLVMWriteBitcodeToMemoryBuffer(self.get()) };
|
||||||
MemoryBuffer(memory_buffer)
|
MemoryBuffer(memory_buffer)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn print_module_to_string(&self) -> Message {
|
|
||||||
let asm = unsafe { LLVMPrintModuleToString(self.get()) };
|
|
||||||
Message(unsafe { CStr::from_ptr(asm) })
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Drop for Module {
|
impl Drop for Module {
|
||||||
@ -143,7 +130,7 @@ impl Drop for Builder {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct Message(&'static CStr);
|
struct Message(&'static CStr);
|
||||||
|
|
||||||
impl Drop for Message {
|
impl Drop for Message {
|
||||||
fn drop(&mut self) {
|
fn drop(&mut self) {
|
||||||
@ -159,12 +146,6 @@ impl std::fmt::Debug for Message {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Message {
|
|
||||||
pub fn to_str(&self) -> &str {
|
|
||||||
self.0.to_str().unwrap().trim()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct MemoryBuffer(LLVMMemoryBufferRef);
|
pub struct MemoryBuffer(LLVMMemoryBufferRef);
|
||||||
|
|
||||||
impl Drop for MemoryBuffer {
|
impl Drop for MemoryBuffer {
|
||||||
@ -187,11 +168,11 @@ impl Deref for MemoryBuffer {
|
|||||||
|
|
||||||
pub(super) fn run<'input>(
|
pub(super) fn run<'input>(
|
||||||
id_defs: GlobalStringIdentResolver2<'input>,
|
id_defs: GlobalStringIdentResolver2<'input>,
|
||||||
directives: Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>,
|
directives: Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>,
|
||||||
) -> Result<Module, TranslateError> {
|
) -> Result<MemoryBuffer, TranslateError> {
|
||||||
let context = Context::new();
|
let context = Context::new();
|
||||||
let module = Module::new(context, LLVM_UNNAMED);
|
let module = Module::new(&context, LLVM_UNNAMED);
|
||||||
let mut emit_ctx = ModuleEmitContext::new(&module, &id_defs);
|
let mut emit_ctx = ModuleEmitContext::new(&context, &module, &id_defs);
|
||||||
for directive in directives {
|
for directive in directives {
|
||||||
match directive {
|
match directive {
|
||||||
Directive2::Variable(linking, variable) => emit_ctx.emit_global(linking, variable)?,
|
Directive2::Variable(linking, variable) => emit_ctx.emit_global(linking, variable)?,
|
||||||
@ -201,7 +182,7 @@ pub(super) fn run<'input>(
|
|||||||
if let Err(err) = module.verify() {
|
if let Err(err) = module.verify() {
|
||||||
panic!("{:?}", err);
|
panic!("{:?}", err);
|
||||||
}
|
}
|
||||||
Ok(module)
|
Ok(module.write_bitcode_to_memory())
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ModuleEmitContext<'a, 'input> {
|
struct ModuleEmitContext<'a, 'input> {
|
||||||
@ -213,8 +194,11 @@ struct ModuleEmitContext<'a, 'input> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl<'a, 'input> ModuleEmitContext<'a, 'input> {
|
impl<'a, 'input> ModuleEmitContext<'a, 'input> {
|
||||||
fn new(module: &Module, id_defs: &'a GlobalStringIdentResolver2<'input>) -> Self {
|
fn new(
|
||||||
let context = module.context();
|
context: &Context,
|
||||||
|
module: &Module,
|
||||||
|
id_defs: &'a GlobalStringIdentResolver2<'input>,
|
||||||
|
) -> Self {
|
||||||
ModuleEmitContext {
|
ModuleEmitContext {
|
||||||
context: context.get(),
|
context: context.get(),
|
||||||
module: module.get(),
|
module: module.get(),
|
||||||
@ -234,20 +218,24 @@ impl<'a, 'input> ModuleEmitContext<'a, 'input> {
|
|||||||
|
|
||||||
fn emit_method(
|
fn emit_method(
|
||||||
&mut self,
|
&mut self,
|
||||||
method: Function2<ast::Instruction<SpirvWord>, SpirvWord>,
|
method: Function2<'input, ast::Instruction<SpirvWord>, SpirvWord>,
|
||||||
) -> Result<(), TranslateError> {
|
) -> Result<(), TranslateError> {
|
||||||
|
let func_decl = method.func_decl;
|
||||||
let name = method
|
let name = method
|
||||||
.import_as
|
.import_as
|
||||||
.as_deref()
|
.as_deref()
|
||||||
.or_else(|| self.id_defs.ident_map[&method.name].name.as_deref())
|
.or_else(|| match func_decl.name {
|
||||||
|
ast::MethodName::Kernel(name) => Some(name),
|
||||||
|
ast::MethodName::Func(id) => self.id_defs.ident_map[&id].name.as_deref(),
|
||||||
|
})
|
||||||
.ok_or_else(|| error_unreachable())?;
|
.ok_or_else(|| error_unreachable())?;
|
||||||
let name = CString::new(name).map_err(|_| error_unreachable())?;
|
let name = CString::new(name).map_err(|_| error_unreachable())?;
|
||||||
let mut fn_ = unsafe { LLVMGetNamedFunction(self.module, name.as_ptr()) };
|
let mut fn_ = unsafe { LLVMGetNamedFunction(self.module, name.as_ptr()) };
|
||||||
if fn_ == ptr::null_mut() {
|
if fn_ == ptr::null_mut() {
|
||||||
let fn_type = get_function_type(
|
let fn_type = get_function_type(
|
||||||
self.context,
|
self.context,
|
||||||
method.return_arguments.iter().map(|v| &v.v_type),
|
func_decl.return_arguments.iter().map(|v| &v.v_type),
|
||||||
method
|
func_decl
|
||||||
.input_arguments
|
.input_arguments
|
||||||
.iter()
|
.iter()
|
||||||
.map(|v| get_input_argument_type(self.context, &v.v_type, v.state_space)),
|
.map(|v| get_input_argument_type(self.context, &v.v_type, v.state_space)),
|
||||||
@ -257,28 +245,15 @@ impl<'a, 'input> ModuleEmitContext<'a, 'input> {
|
|||||||
self.emit_fn_attribute(fn_, "uniform-work-group-size", "true");
|
self.emit_fn_attribute(fn_, "uniform-work-group-size", "true");
|
||||||
self.emit_fn_attribute(fn_, "no-trapping-math", "true");
|
self.emit_fn_attribute(fn_, "no-trapping-math", "true");
|
||||||
}
|
}
|
||||||
if !method.is_kernel {
|
if let ast::MethodName::Func(name) = func_decl.name {
|
||||||
self.resolver.register(method.name, fn_);
|
self.resolver.register(name, fn_);
|
||||||
self.emit_fn_attribute(fn_, "denormal-fp-math-f32", "dynamic");
|
|
||||||
self.emit_fn_attribute(fn_, "denormal-fp-math", "dynamic");
|
|
||||||
} else {
|
|
||||||
self.emit_fn_attribute(
|
|
||||||
fn_,
|
|
||||||
"denormal-fp-math-f32",
|
|
||||||
llvm_ftz(method.flush_to_zero_f32),
|
|
||||||
);
|
|
||||||
self.emit_fn_attribute(
|
|
||||||
fn_,
|
|
||||||
"denormal-fp-math",
|
|
||||||
llvm_ftz(method.flush_to_zero_f16f64),
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
for (i, param) in method.input_arguments.iter().enumerate() {
|
for (i, param) in func_decl.input_arguments.iter().enumerate() {
|
||||||
let value = unsafe { LLVMGetParam(fn_, i as u32) };
|
let value = unsafe { LLVMGetParam(fn_, i as u32) };
|
||||||
let name = self.resolver.get_or_add(param.name);
|
let name = self.resolver.get_or_add(param.name);
|
||||||
unsafe { LLVMSetValueName2(value, name.as_ptr().cast(), name.len()) };
|
unsafe { LLVMSetValueName2(value, name.as_ptr().cast(), name.len()) };
|
||||||
self.resolver.register(param.name, value);
|
self.resolver.register(param.name, value);
|
||||||
if method.is_kernel {
|
if func_decl.name.is_kernel() {
|
||||||
let attr_kind = unsafe {
|
let attr_kind = unsafe {
|
||||||
LLVMGetEnumAttributeKindForName(b"byref".as_ptr().cast(), b"byref".len())
|
LLVMGetEnumAttributeKindForName(b"byref".as_ptr().cast(), b"byref".len())
|
||||||
};
|
};
|
||||||
@ -292,7 +267,7 @@ impl<'a, 'input> ModuleEmitContext<'a, 'input> {
|
|||||||
unsafe { LLVMAddAttributeAtIndex(fn_, i as u32 + 1, attr) };
|
unsafe { LLVMAddAttributeAtIndex(fn_, i as u32 + 1, attr) };
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
let call_conv = if method.is_kernel {
|
let call_conv = if func_decl.name.is_kernel() {
|
||||||
Self::kernel_call_convention()
|
Self::kernel_call_convention()
|
||||||
} else {
|
} else {
|
||||||
Self::func_call_convention()
|
Self::func_call_convention()
|
||||||
@ -307,7 +282,7 @@ impl<'a, 'input> ModuleEmitContext<'a, 'input> {
|
|||||||
unsafe { LLVMAppendBasicBlockInContext(self.context, fn_, LLVM_UNNAMED.as_ptr()) };
|
unsafe { LLVMAppendBasicBlockInContext(self.context, fn_, LLVM_UNNAMED.as_ptr()) };
|
||||||
unsafe { LLVMPositionBuilderAtEnd(self.builder.get(), real_bb) };
|
unsafe { LLVMPositionBuilderAtEnd(self.builder.get(), real_bb) };
|
||||||
let mut method_emitter = MethodEmitContext::new(self, fn_, variables_builder);
|
let mut method_emitter = MethodEmitContext::new(self, fn_, variables_builder);
|
||||||
for var in method.return_arguments {
|
for var in func_decl.return_arguments {
|
||||||
method_emitter.emit_variable(var)?;
|
method_emitter.emit_variable(var)?;
|
||||||
}
|
}
|
||||||
for statement in statements.iter() {
|
for statement in statements.iter() {
|
||||||
@ -315,17 +290,6 @@ impl<'a, 'input> ModuleEmitContext<'a, 'input> {
|
|||||||
method_emitter.emit_label_initial(*label);
|
method_emitter.emit_label_initial(*label);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
let mut statements = statements.into_iter();
|
|
||||||
if let Some(Statement::Label(label)) = statements.next() {
|
|
||||||
method_emitter.emit_label_delayed(label)?;
|
|
||||||
} else {
|
|
||||||
return Err(error_unreachable());
|
|
||||||
}
|
|
||||||
method_emitter.emit_kernel_rounding_prelude(
|
|
||||||
method.is_kernel,
|
|
||||||
method.rounding_mode_f32,
|
|
||||||
method.rounding_mode_f16f64,
|
|
||||||
)?;
|
|
||||||
for statement in statements {
|
for statement in statements {
|
||||||
method_emitter.emit_statement(statement)?;
|
method_emitter.emit_statement(statement)?;
|
||||||
}
|
}
|
||||||
@ -453,14 +417,6 @@ impl<'a, 'input> ModuleEmitContext<'a, 'input> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn llvm_ftz(ftz: bool) -> &'static str {
|
|
||||||
if ftz {
|
|
||||||
"preserve-sign"
|
|
||||||
} else {
|
|
||||||
"ieee"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn get_input_argument_type(
|
fn get_input_argument_type(
|
||||||
context: LLVMContextRef,
|
context: LLVMContextRef,
|
||||||
v_type: &ast::Type,
|
v_type: &ast::Type,
|
||||||
@ -517,32 +473,9 @@ impl<'a> MethodEmitContext<'a> {
|
|||||||
Statement::FunctionPointer(_) => todo!(),
|
Statement::FunctionPointer(_) => todo!(),
|
||||||
Statement::VectorRead(vector_read) => self.emit_vector_read(vector_read)?,
|
Statement::VectorRead(vector_read) => self.emit_vector_read(vector_read)?,
|
||||||
Statement::VectorWrite(vector_write) => self.emit_vector_write(vector_write)?,
|
Statement::VectorWrite(vector_write) => self.emit_vector_write(vector_write)?,
|
||||||
Statement::SetMode(mode_reg) => self.emit_set_mode(mode_reg)?,
|
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
// This should be a kernel attribute, but sadly AMDGPU LLVM target does
|
|
||||||
// not support attribute for it. So we have to set it as the first
|
|
||||||
// instruction in the body of a kernel
|
|
||||||
fn emit_kernel_rounding_prelude(
|
|
||||||
&mut self,
|
|
||||||
is_kernel: bool,
|
|
||||||
rounding_mode_f32: ast::RoundingMode,
|
|
||||||
rounding_mode_f16f64: ast::RoundingMode,
|
|
||||||
) -> Result<(), TranslateError> {
|
|
||||||
if is_kernel {
|
|
||||||
if rounding_mode_f32 != ast::RoundingMode::NearestEven
|
|
||||||
|| rounding_mode_f16f64 != ast::RoundingMode::NearestEven
|
|
||||||
{
|
|
||||||
self.emit_set_mode(ModeRegister::Rounding {
|
|
||||||
f32: rounding_mode_f32,
|
|
||||||
f16f64: rounding_mode_f16f64,
|
|
||||||
})?;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn emit_variable(&mut self, var: ast::Variable<SpirvWord>) -> Result<(), TranslateError> {
|
fn emit_variable(&mut self, var: ast::Variable<SpirvWord>) -> Result<(), TranslateError> {
|
||||||
let alloca = unsafe {
|
let alloca = unsafe {
|
||||||
LLVMZludaBuildAlloca(
|
LLVMZludaBuildAlloca(
|
||||||
@ -595,7 +528,6 @@ impl<'a> MethodEmitContext<'a> {
|
|||||||
ast::Instruction::Add { data, arguments } => self.emit_add(data, arguments),
|
ast::Instruction::Add { data, arguments } => self.emit_add(data, arguments),
|
||||||
ast::Instruction::St { data, arguments } => self.emit_st(data, arguments),
|
ast::Instruction::St { data, arguments } => self.emit_st(data, arguments),
|
||||||
ast::Instruction::Mul { data, arguments } => self.emit_mul(data, arguments),
|
ast::Instruction::Mul { data, arguments } => self.emit_mul(data, arguments),
|
||||||
ast::Instruction::Mul24 { data, arguments } => self.emit_mul24(data, arguments),
|
|
||||||
ast::Instruction::Setp { data, arguments } => self.emit_setp(data, arguments),
|
ast::Instruction::Setp { data, arguments } => self.emit_setp(data, arguments),
|
||||||
ast::Instruction::SetpBool { .. } => todo!(),
|
ast::Instruction::SetpBool { .. } => todo!(),
|
||||||
ast::Instruction::Not { data, arguments } => self.emit_not(data, arguments),
|
ast::Instruction::Not { data, arguments } => self.emit_not(data, arguments),
|
||||||
@ -1196,7 +1128,7 @@ impl<'a> MethodEmitContext<'a> {
|
|||||||
let cos = self.emit_intrinsic(
|
let cos = self.emit_intrinsic(
|
||||||
c"llvm.cos.f32",
|
c"llvm.cos.f32",
|
||||||
Some(arguments.dst),
|
Some(arguments.dst),
|
||||||
Some(&ast::ScalarType::F32.into()),
|
&ast::ScalarType::F32.into(),
|
||||||
vec![(self.resolver.value(arguments.src)?, llvm_f32)],
|
vec![(self.resolver.value(arguments.src)?, llvm_f32)],
|
||||||
)?;
|
)?;
|
||||||
unsafe { LLVMZludaSetFastMathFlags(cos, LLVMZludaFastMathApproxFunc) }
|
unsafe { LLVMZludaSetFastMathFlags(cos, LLVMZludaFastMathApproxFunc) }
|
||||||
@ -1449,7 +1381,7 @@ impl<'a> MethodEmitContext<'a> {
|
|||||||
let sin = self.emit_intrinsic(
|
let sin = self.emit_intrinsic(
|
||||||
c"llvm.sin.f32",
|
c"llvm.sin.f32",
|
||||||
Some(arguments.dst),
|
Some(arguments.dst),
|
||||||
Some(&ast::ScalarType::F32.into()),
|
&ast::ScalarType::F32.into(),
|
||||||
vec![(self.resolver.value(arguments.src)?, llvm_f32)],
|
vec![(self.resolver.value(arguments.src)?, llvm_f32)],
|
||||||
)?;
|
)?;
|
||||||
unsafe { LLVMZludaSetFastMathFlags(sin, LLVMZludaFastMathApproxFunc) }
|
unsafe { LLVMZludaSetFastMathFlags(sin, LLVMZludaFastMathApproxFunc) }
|
||||||
@ -1460,12 +1392,12 @@ impl<'a> MethodEmitContext<'a> {
|
|||||||
&mut self,
|
&mut self,
|
||||||
name: &CStr,
|
name: &CStr,
|
||||||
dst: Option<SpirvWord>,
|
dst: Option<SpirvWord>,
|
||||||
return_type: Option<&ast::Type>,
|
return_type: &ast::Type,
|
||||||
arguments: Vec<(LLVMValueRef, LLVMTypeRef)>,
|
arguments: Vec<(LLVMValueRef, LLVMTypeRef)>,
|
||||||
) -> Result<LLVMValueRef, TranslateError> {
|
) -> Result<LLVMValueRef, TranslateError> {
|
||||||
let fn_type = get_function_type(
|
let fn_type = get_function_type(
|
||||||
self.context,
|
self.context,
|
||||||
return_type.into_iter(),
|
iter::once(return_type),
|
||||||
arguments.iter().map(|(_, type_)| Ok(*type_)),
|
arguments.iter().map(|(_, type_)| Ok(*type_)),
|
||||||
)?;
|
)?;
|
||||||
let mut fn_ = unsafe { LLVMGetNamedFunction(self.module, name.as_ptr()) };
|
let mut fn_ = unsafe { LLVMGetNamedFunction(self.module, name.as_ptr()) };
|
||||||
@ -1626,7 +1558,7 @@ impl<'a> MethodEmitContext<'a> {
|
|||||||
return self.emit_cvt_float_to_int(
|
return self.emit_cvt_float_to_int(
|
||||||
data.from,
|
data.from,
|
||||||
data.to,
|
data.to,
|
||||||
integer_rounding,
|
integer_rounding.unwrap_or(ast::RoundingMode::NearestEven),
|
||||||
arguments,
|
arguments,
|
||||||
Some(LLVMBuildFPToSI),
|
Some(LLVMBuildFPToSI),
|
||||||
)
|
)
|
||||||
@ -1684,7 +1616,7 @@ impl<'a> MethodEmitContext<'a> {
|
|||||||
let clamped = self.emit_intrinsic(
|
let clamped = self.emit_intrinsic(
|
||||||
c"llvm.umin",
|
c"llvm.umin",
|
||||||
None,
|
None,
|
||||||
Some(&from.into()),
|
&from.into(),
|
||||||
vec![
|
vec![
|
||||||
(self.resolver.value(arguments.src)?, from_llvm),
|
(self.resolver.value(arguments.src)?, from_llvm),
|
||||||
(max, from_llvm),
|
(max, from_llvm),
|
||||||
@ -1714,7 +1646,7 @@ impl<'a> MethodEmitContext<'a> {
|
|||||||
let zero_clamped = self.emit_intrinsic(
|
let zero_clamped = self.emit_intrinsic(
|
||||||
unsafe { CStr::from_bytes_with_nul_unchecked(zero_clamp_intrinsic.as_bytes()) },
|
unsafe { CStr::from_bytes_with_nul_unchecked(zero_clamp_intrinsic.as_bytes()) },
|
||||||
None,
|
None,
|
||||||
Some(&from.into()),
|
&from.into(),
|
||||||
vec![
|
vec![
|
||||||
(self.resolver.value(arguments.src)?, from_llvm),
|
(self.resolver.value(arguments.src)?, from_llvm),
|
||||||
(zero, from_llvm),
|
(zero, from_llvm),
|
||||||
@ -1733,7 +1665,7 @@ impl<'a> MethodEmitContext<'a> {
|
|||||||
let fully_clamped = self.emit_intrinsic(
|
let fully_clamped = self.emit_intrinsic(
|
||||||
unsafe { CStr::from_bytes_with_nul_unchecked(max_clamp_intrinsic.as_bytes()) },
|
unsafe { CStr::from_bytes_with_nul_unchecked(max_clamp_intrinsic.as_bytes()) },
|
||||||
None,
|
None,
|
||||||
Some(&from.into()),
|
&from.into(),
|
||||||
vec![(zero_clamped, from_llvm), (max, from_llvm)],
|
vec![(zero_clamped, from_llvm), (max, from_llvm)],
|
||||||
)?;
|
)?;
|
||||||
let resize_fn = if to.layout().size() >= from.layout().size() {
|
let resize_fn = if to.layout().size() >= from.layout().size() {
|
||||||
@ -1773,7 +1705,7 @@ impl<'a> MethodEmitContext<'a> {
|
|||||||
let rounded_float = self.emit_intrinsic(
|
let rounded_float = self.emit_intrinsic(
|
||||||
unsafe { CStr::from_bytes_with_nul_unchecked(intrinsic.as_bytes()) },
|
unsafe { CStr::from_bytes_with_nul_unchecked(intrinsic.as_bytes()) },
|
||||||
None,
|
None,
|
||||||
Some(&from.into()),
|
&from.into(),
|
||||||
vec![(
|
vec![(
|
||||||
self.resolver.value(arguments.src)?,
|
self.resolver.value(arguments.src)?,
|
||||||
get_scalar_type(self.context, from),
|
get_scalar_type(self.context, from),
|
||||||
@ -1842,7 +1774,7 @@ impl<'a> MethodEmitContext<'a> {
|
|||||||
self.emit_intrinsic(
|
self.emit_intrinsic(
|
||||||
intrinsic,
|
intrinsic,
|
||||||
Some(arguments.dst),
|
Some(arguments.dst),
|
||||||
Some(&data.type_.into()),
|
&data.type_.into(),
|
||||||
vec![(self.resolver.value(arguments.src)?, type_)],
|
vec![(self.resolver.value(arguments.src)?, type_)],
|
||||||
)?;
|
)?;
|
||||||
Ok(())
|
Ok(())
|
||||||
@ -1863,7 +1795,7 @@ impl<'a> MethodEmitContext<'a> {
|
|||||||
self.emit_intrinsic(
|
self.emit_intrinsic(
|
||||||
intrinsic,
|
intrinsic,
|
||||||
Some(arguments.dst),
|
Some(arguments.dst),
|
||||||
Some(&data.type_.into()),
|
&data.type_.into(),
|
||||||
vec![(self.resolver.value(arguments.src)?, type_)],
|
vec![(self.resolver.value(arguments.src)?, type_)],
|
||||||
)?;
|
)?;
|
||||||
Ok(())
|
Ok(())
|
||||||
@ -1885,7 +1817,7 @@ impl<'a> MethodEmitContext<'a> {
|
|||||||
self.emit_intrinsic(
|
self.emit_intrinsic(
|
||||||
intrinsic,
|
intrinsic,
|
||||||
Some(arguments.dst),
|
Some(arguments.dst),
|
||||||
Some(&data.type_.into()),
|
&data.type_.into(),
|
||||||
vec![(self.resolver.value(arguments.src)?, type_)],
|
vec![(self.resolver.value(arguments.src)?, type_)],
|
||||||
)?;
|
)?;
|
||||||
Ok(())
|
Ok(())
|
||||||
@ -2007,7 +1939,7 @@ impl<'a> MethodEmitContext<'a> {
|
|||||||
self.emit_intrinsic(
|
self.emit_intrinsic(
|
||||||
intrinsic,
|
intrinsic,
|
||||||
Some(arguments.dst),
|
Some(arguments.dst),
|
||||||
Some(&data.type_.into()),
|
&data.type_.into(),
|
||||||
vec![(
|
vec![(
|
||||||
self.resolver.value(arguments.src)?,
|
self.resolver.value(arguments.src)?,
|
||||||
get_scalar_type(self.context, data.type_),
|
get_scalar_type(self.context, data.type_),
|
||||||
@ -2024,7 +1956,7 @@ impl<'a> MethodEmitContext<'a> {
|
|||||||
self.emit_intrinsic(
|
self.emit_intrinsic(
|
||||||
c"llvm.amdgcn.log.f32",
|
c"llvm.amdgcn.log.f32",
|
||||||
Some(arguments.dst),
|
Some(arguments.dst),
|
||||||
Some(&ast::ScalarType::F32.into()),
|
&ast::ScalarType::F32.into(),
|
||||||
vec![(
|
vec![(
|
||||||
self.resolver.value(arguments.src)?,
|
self.resolver.value(arguments.src)?,
|
||||||
get_scalar_type(self.context, ast::ScalarType::F32.into()),
|
get_scalar_type(self.context, ast::ScalarType::F32.into()),
|
||||||
@ -2079,7 +2011,7 @@ impl<'a> MethodEmitContext<'a> {
|
|||||||
self.emit_intrinsic(
|
self.emit_intrinsic(
|
||||||
intrinsic,
|
intrinsic,
|
||||||
Some(arguments.dst),
|
Some(arguments.dst),
|
||||||
Some(&type_.into()),
|
&type_.into(),
|
||||||
vec![(self.resolver.value(arguments.src)?, llvm_type)],
|
vec![(self.resolver.value(arguments.src)?, llvm_type)],
|
||||||
)?;
|
)?;
|
||||||
Ok(())
|
Ok(())
|
||||||
@ -2103,7 +2035,7 @@ impl<'a> MethodEmitContext<'a> {
|
|||||||
self.emit_intrinsic(
|
self.emit_intrinsic(
|
||||||
unsafe { CStr::from_bytes_with_nul_unchecked(intrinsic.as_bytes()) },
|
unsafe { CStr::from_bytes_with_nul_unchecked(intrinsic.as_bytes()) },
|
||||||
Some(arguments.dst),
|
Some(arguments.dst),
|
||||||
Some(&data.type_().into()),
|
&data.type_().into(),
|
||||||
vec![
|
vec![
|
||||||
(self.resolver.value(arguments.src1)?, llvm_type),
|
(self.resolver.value(arguments.src1)?, llvm_type),
|
||||||
(self.resolver.value(arguments.src2)?, llvm_type),
|
(self.resolver.value(arguments.src2)?, llvm_type),
|
||||||
@ -2130,7 +2062,7 @@ impl<'a> MethodEmitContext<'a> {
|
|||||||
self.emit_intrinsic(
|
self.emit_intrinsic(
|
||||||
unsafe { CStr::from_bytes_with_nul_unchecked(intrinsic.as_bytes()) },
|
unsafe { CStr::from_bytes_with_nul_unchecked(intrinsic.as_bytes()) },
|
||||||
Some(arguments.dst),
|
Some(arguments.dst),
|
||||||
Some(&data.type_().into()),
|
&data.type_().into(),
|
||||||
vec![
|
vec![
|
||||||
(self.resolver.value(arguments.src1)?, llvm_type),
|
(self.resolver.value(arguments.src1)?, llvm_type),
|
||||||
(self.resolver.value(arguments.src2)?, llvm_type),
|
(self.resolver.value(arguments.src2)?, llvm_type),
|
||||||
@ -2148,7 +2080,7 @@ impl<'a> MethodEmitContext<'a> {
|
|||||||
self.emit_intrinsic(
|
self.emit_intrinsic(
|
||||||
unsafe { CStr::from_bytes_with_nul_unchecked(intrinsic.as_bytes()) },
|
unsafe { CStr::from_bytes_with_nul_unchecked(intrinsic.as_bytes()) },
|
||||||
Some(arguments.dst),
|
Some(arguments.dst),
|
||||||
Some(&data.type_.into()),
|
&data.type_.into(),
|
||||||
vec![
|
vec![
|
||||||
(
|
(
|
||||||
self.resolver.value(arguments.src1)?,
|
self.resolver.value(arguments.src1)?,
|
||||||
@ -2269,118 +2201,12 @@ impl<'a> MethodEmitContext<'a> {
|
|||||||
self.emit_intrinsic(
|
self.emit_intrinsic(
|
||||||
unsafe { CStr::from_bytes_with_nul_unchecked(llvm_intrinsic.as_bytes()) },
|
unsafe { CStr::from_bytes_with_nul_unchecked(llvm_intrinsic.as_bytes()) },
|
||||||
Some(arguments.dst),
|
Some(arguments.dst),
|
||||||
Some(&data.type_.into()),
|
&data.type_.into(),
|
||||||
intrinsic_arguments,
|
intrinsic_arguments,
|
||||||
)?;
|
)?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn emit_mul24(
|
|
||||||
&mut self,
|
|
||||||
data: ast::Mul24Details,
|
|
||||||
arguments: ast::Mul24Args<SpirvWord>,
|
|
||||||
) -> Result<(), TranslateError> {
|
|
||||||
let src1 = self.resolver.value(arguments.src1)?;
|
|
||||||
let src2 = self.resolver.value(arguments.src2)?;
|
|
||||||
let name_lo = match data.type_ {
|
|
||||||
ast::ScalarType::U32 => c"llvm.amdgcn.mul.u24",
|
|
||||||
ast::ScalarType::S32 => c"llvm.amdgcn.mul.i24",
|
|
||||||
_ => return Err(error_unreachable()),
|
|
||||||
};
|
|
||||||
let res_lo = self.emit_intrinsic(
|
|
||||||
name_lo,
|
|
||||||
if data.control == Mul24Control::Lo { Some(arguments.dst) } else { None },
|
|
||||||
Some(&ast::Type::Scalar(data.type_)),
|
|
||||||
vec![
|
|
||||||
(src1, get_scalar_type(self.context, data.type_)),
|
|
||||||
(src2, get_scalar_type(self.context, data.type_)),
|
|
||||||
],
|
|
||||||
)?;
|
|
||||||
if data.control == Mul24Control::Hi {
|
|
||||||
// There is an important difference between NVIDIA's mul24.hi and AMD's mulhi.[ui]24.
|
|
||||||
// NVIDIA: Returns bits 47..16 of the 64-bit result
|
|
||||||
// AMD: Returns bits 63..32 of the 64-bit result
|
|
||||||
// Hence we need to compute both hi and lo, shift the results and add them together to replicate NVIDIA's mul24
|
|
||||||
let name_hi = match data.type_ {
|
|
||||||
ast::ScalarType::U32 => c"llvm.amdgcn.mulhi.u24",
|
|
||||||
ast::ScalarType::S32 => c"llvm.amdgcn.mulhi.i24",
|
|
||||||
_ => return Err(error_unreachable()),
|
|
||||||
};
|
|
||||||
let res_hi = self.emit_intrinsic(
|
|
||||||
name_hi,
|
|
||||||
None,
|
|
||||||
Some(&ast::Type::Scalar(data.type_)),
|
|
||||||
vec![
|
|
||||||
(src1, get_scalar_type(self.context, data.type_)),
|
|
||||||
(src2, get_scalar_type(self.context, data.type_)),
|
|
||||||
],
|
|
||||||
)?;
|
|
||||||
let shift_number = unsafe { LLVMConstInt(LLVMInt32TypeInContext(self.context), 16, 0) };
|
|
||||||
let res_lo_shr = unsafe {
|
|
||||||
LLVMBuildLShr(self.builder, res_lo, shift_number, LLVM_UNNAMED.as_ptr())
|
|
||||||
};
|
|
||||||
let res_hi_shl =
|
|
||||||
unsafe { LLVMBuildShl(self.builder, res_hi, shift_number, LLVM_UNNAMED.as_ptr()) };
|
|
||||||
|
|
||||||
self.resolver
|
|
||||||
.with_result(arguments.dst, |dst: *const i8| unsafe {
|
|
||||||
LLVMBuildOr(self.builder, res_lo_shr, res_hi_shl, dst)
|
|
||||||
});
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn emit_set_mode(&mut self, mode_reg: ModeRegister) -> Result<(), TranslateError> {
|
|
||||||
fn hwreg(reg: u32, offset: u32, size: u32) -> u32 {
|
|
||||||
reg | (offset << 6) | ((size - 1) << 11)
|
|
||||||
}
|
|
||||||
fn denormal_to_value(ftz: bool) -> u32 {
|
|
||||||
if ftz {
|
|
||||||
0
|
|
||||||
} else {
|
|
||||||
3
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fn rounding_to_value(ftz: ast::RoundingMode) -> u32 {
|
|
||||||
match ftz {
|
|
||||||
ptx_parser::RoundingMode::NearestEven => 0,
|
|
||||||
ptx_parser::RoundingMode::Zero => 3,
|
|
||||||
ptx_parser::RoundingMode::NegativeInf => 2,
|
|
||||||
ptx_parser::RoundingMode::PositiveInf => 1,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fn merge_regs(f32: u32, f16f64: u32) -> u32 {
|
|
||||||
f32 | f16f64 << 2
|
|
||||||
}
|
|
||||||
let intrinsic = c"llvm.amdgcn.s.setreg";
|
|
||||||
let (hwreg, value) = match mode_reg {
|
|
||||||
ModeRegister::Denormal { f32, f16f64 } => {
|
|
||||||
let hwreg = hwreg(1, 4, 4);
|
|
||||||
let f32 = denormal_to_value(f32);
|
|
||||||
let f16f64 = denormal_to_value(f16f64);
|
|
||||||
let value = merge_regs(f32, f16f64);
|
|
||||||
(hwreg, value)
|
|
||||||
}
|
|
||||||
ModeRegister::Rounding { f32, f16f64 } => {
|
|
||||||
let hwreg = hwreg(1, 0, 4);
|
|
||||||
let f32 = rounding_to_value(f32);
|
|
||||||
let f16f64 = rounding_to_value(f16f64);
|
|
||||||
let value = merge_regs(f32, f16f64);
|
|
||||||
(hwreg, value)
|
|
||||||
}
|
|
||||||
};
|
|
||||||
let llvm_i32 = get_scalar_type(self.context, ast::ScalarType::B32);
|
|
||||||
let hwreg_llvm = unsafe { LLVMConstInt(llvm_i32, hwreg as _, 0) };
|
|
||||||
let value_llvm = unsafe { LLVMConstInt(llvm_i32, value as _, 0) };
|
|
||||||
self.emit_intrinsic(
|
|
||||||
intrinsic,
|
|
||||||
None,
|
|
||||||
None,
|
|
||||||
vec![(hwreg_llvm, llvm_i32), (value_llvm, llvm_i32)],
|
|
||||||
)?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
// Currently unused, LLVM 18 (ROCm 6.2) does not support `llvm.set.rounding`
|
// Currently unused, LLVM 18 (ROCm 6.2) does not support `llvm.set.rounding`
|
||||||
// Should be available in LLVM 19
|
// Should be available in LLVM 19
|
||||||
|
@ -2,8 +2,8 @@ use super::*;
|
|||||||
|
|
||||||
pub(super) fn run<'a, 'input>(
|
pub(super) fn run<'a, 'input>(
|
||||||
resolver: &mut GlobalStringIdentResolver2<'input>,
|
resolver: &mut GlobalStringIdentResolver2<'input>,
|
||||||
directives: Vec<UnconditionalDirective>,
|
directives: Vec<UnconditionalDirective<'input>>,
|
||||||
) -> Result<Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
|
) -> Result<Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
|
||||||
directives
|
directives
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|directive| run_directive(resolver, directive))
|
.map(|directive| run_directive(resolver, directive))
|
||||||
@ -13,10 +13,11 @@ pub(super) fn run<'a, 'input>(
|
|||||||
fn run_directive<'input>(
|
fn run_directive<'input>(
|
||||||
resolver: &mut GlobalStringIdentResolver2<'input>,
|
resolver: &mut GlobalStringIdentResolver2<'input>,
|
||||||
directive: Directive2<
|
directive: Directive2<
|
||||||
|
'input,
|
||||||
ast::Instruction<ast::ParsedOperand<SpirvWord>>,
|
ast::Instruction<ast::ParsedOperand<SpirvWord>>,
|
||||||
ast::ParsedOperand<SpirvWord>,
|
ast::ParsedOperand<SpirvWord>,
|
||||||
>,
|
>,
|
||||||
) -> Result<Directive2<ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
|
) -> Result<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
|
||||||
Ok(match directive {
|
Ok(match directive {
|
||||||
Directive2::Variable(linking, var) => Directive2::Variable(linking, var),
|
Directive2::Variable(linking, var) => Directive2::Variable(linking, var),
|
||||||
Directive2::Method(method) => Directive2::Method(run_method(resolver, method)?),
|
Directive2::Method(method) => Directive2::Method(run_method(resolver, method)?),
|
||||||
@ -26,10 +27,11 @@ fn run_directive<'input>(
|
|||||||
fn run_method<'input>(
|
fn run_method<'input>(
|
||||||
resolver: &mut GlobalStringIdentResolver2<'input>,
|
resolver: &mut GlobalStringIdentResolver2<'input>,
|
||||||
method: Function2<
|
method: Function2<
|
||||||
|
'input,
|
||||||
ast::Instruction<ast::ParsedOperand<SpirvWord>>,
|
ast::Instruction<ast::ParsedOperand<SpirvWord>>,
|
||||||
ast::ParsedOperand<SpirvWord>,
|
ast::ParsedOperand<SpirvWord>,
|
||||||
>,
|
>,
|
||||||
) -> Result<Function2<ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
|
) -> Result<Function2<'input, ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
|
||||||
let body = method
|
let body = method
|
||||||
.body
|
.body
|
||||||
.map(|statements| {
|
.map(|statements| {
|
||||||
@ -41,18 +43,12 @@ fn run_method<'input>(
|
|||||||
})
|
})
|
||||||
.transpose()?;
|
.transpose()?;
|
||||||
Ok(Function2 {
|
Ok(Function2 {
|
||||||
|
func_decl: method.func_decl,
|
||||||
|
globals: method.globals,
|
||||||
body,
|
body,
|
||||||
return_arguments: method.return_arguments,
|
|
||||||
name: method.name,
|
|
||||||
input_arguments: method.input_arguments,
|
|
||||||
import_as: method.import_as,
|
import_as: method.import_as,
|
||||||
tuning: method.tuning,
|
tuning: method.tuning,
|
||||||
linkage: method.linkage,
|
linkage: method.linkage,
|
||||||
is_kernel: method.is_kernel,
|
|
||||||
flush_to_zero_f32: method.flush_to_zero_f32,
|
|
||||||
flush_to_zero_f16f64: method.flush_to_zero_f16f64,
|
|
||||||
rounding_mode_f32: method.rounding_mode_f32,
|
|
||||||
rounding_mode_f16f64: method.rounding_mode_f16f64,
|
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,33 +1,30 @@
|
|||||||
use super::*;
|
use super::*;
|
||||||
|
|
||||||
pub(super) fn run<'a, 'input>(
|
pub(super) fn run<'a, 'input>(
|
||||||
resolver: &'a mut GlobalStringIdentResolver2<'input>,
|
resolver: &mut GlobalStringIdentResolver2<'input>,
|
||||||
special_registers: &'a SpecialRegistersMap2,
|
special_registers: &'a SpecialRegistersMap2,
|
||||||
directives: Vec<UnconditionalDirective>,
|
directives: Vec<UnconditionalDirective<'input>>,
|
||||||
) -> Result<Vec<UnconditionalDirective>, TranslateError> {
|
) -> Result<Vec<UnconditionalDirective<'input>>, TranslateError> {
|
||||||
let mut result = Vec::with_capacity(SpecialRegistersMap2::len() + directives.len());
|
let declarations = SpecialRegistersMap2::generate_declarations(resolver);
|
||||||
|
let mut result = Vec::with_capacity(declarations.len() + directives.len());
|
||||||
let mut sreg_to_function =
|
let mut sreg_to_function =
|
||||||
FxHashMap::with_capacity_and_hasher(SpecialRegistersMap2::len(), Default::default());
|
FxHashMap::with_capacity_and_hasher(declarations.len(), Default::default());
|
||||||
SpecialRegistersMap2::foreach_declaration(
|
for (sreg, declaration) in declarations {
|
||||||
resolver,
|
let name = if let ast::MethodName::Func(name) = declaration.name {
|
||||||
|sreg, (return_arguments, name, input_arguments)| {
|
name
|
||||||
|
} else {
|
||||||
|
return Err(error_unreachable());
|
||||||
|
};
|
||||||
result.push(UnconditionalDirective::Method(UnconditionalFunction {
|
result.push(UnconditionalDirective::Method(UnconditionalFunction {
|
||||||
return_arguments,
|
func_decl: declaration,
|
||||||
name,
|
globals: Vec::new(),
|
||||||
input_arguments,
|
|
||||||
body: None,
|
body: None,
|
||||||
import_as: None,
|
import_as: None,
|
||||||
tuning: Vec::new(),
|
tuning: Vec::new(),
|
||||||
linkage: ast::LinkingDirective::EXTERN,
|
linkage: ast::LinkingDirective::EXTERN,
|
||||||
is_kernel: false,
|
|
||||||
flush_to_zero_f32: false,
|
|
||||||
flush_to_zero_f16f64: false,
|
|
||||||
rounding_mode_f32: ptx_parser::RoundingMode::NearestEven,
|
|
||||||
rounding_mode_f16f64: ptx_parser::RoundingMode::NearestEven,
|
|
||||||
}));
|
}));
|
||||||
sreg_to_function.insert(sreg, name);
|
sreg_to_function.insert(sreg, name);
|
||||||
},
|
}
|
||||||
);
|
|
||||||
let mut visitor = SpecialRegisterResolver {
|
let mut visitor = SpecialRegisterResolver {
|
||||||
resolver,
|
resolver,
|
||||||
special_registers,
|
special_registers,
|
||||||
@ -42,8 +39,8 @@ pub(super) fn run<'a, 'input>(
|
|||||||
|
|
||||||
fn run_directive<'a, 'input>(
|
fn run_directive<'a, 'input>(
|
||||||
visitor: &mut SpecialRegisterResolver<'a, 'input>,
|
visitor: &mut SpecialRegisterResolver<'a, 'input>,
|
||||||
directive: UnconditionalDirective,
|
directive: UnconditionalDirective<'input>,
|
||||||
) -> Result<UnconditionalDirective, TranslateError> {
|
) -> Result<UnconditionalDirective<'input>, TranslateError> {
|
||||||
Ok(match directive {
|
Ok(match directive {
|
||||||
var @ Directive2::Variable(..) => var,
|
var @ Directive2::Variable(..) => var,
|
||||||
Directive2::Method(method) => Directive2::Method(run_method(visitor, method)?),
|
Directive2::Method(method) => Directive2::Method(run_method(visitor, method)?),
|
||||||
@ -52,8 +49,8 @@ fn run_directive<'a, 'input>(
|
|||||||
|
|
||||||
fn run_method<'a, 'input>(
|
fn run_method<'a, 'input>(
|
||||||
visitor: &mut SpecialRegisterResolver<'a, 'input>,
|
visitor: &mut SpecialRegisterResolver<'a, 'input>,
|
||||||
method: UnconditionalFunction,
|
method: UnconditionalFunction<'input>,
|
||||||
) -> Result<UnconditionalFunction, TranslateError> {
|
) -> Result<UnconditionalFunction<'input>, TranslateError> {
|
||||||
let body = method
|
let body = method
|
||||||
.body
|
.body
|
||||||
.map(|statements| {
|
.map(|statements| {
|
||||||
@ -64,7 +61,14 @@ fn run_method<'a, 'input>(
|
|||||||
Ok::<_, TranslateError>(result)
|
Ok::<_, TranslateError>(result)
|
||||||
})
|
})
|
||||||
.transpose()?;
|
.transpose()?;
|
||||||
Ok(Function2 { body, ..method })
|
Ok(Function2 {
|
||||||
|
func_decl: method.func_decl,
|
||||||
|
globals: method.globals,
|
||||||
|
body,
|
||||||
|
import_as: method.import_as,
|
||||||
|
tuning: method.tuning,
|
||||||
|
linkage: method.linkage,
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
fn run_statement<'a, 'input>(
|
fn run_statement<'a, 'input>(
|
||||||
|
@ -1,8 +1,8 @@
|
|||||||
use super::*;
|
use super::*;
|
||||||
|
|
||||||
pub(super) fn run<'input>(
|
pub(super) fn run<'input>(
|
||||||
directives: Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>,
|
directives: Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>,
|
||||||
) -> Result<Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
|
) -> Result<Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
|
||||||
let mut result = Vec::with_capacity(directives.len());
|
let mut result = Vec::with_capacity(directives.len());
|
||||||
for mut directive in directives.into_iter() {
|
for mut directive in directives.into_iter() {
|
||||||
run_directive(&mut result, &mut directive)?;
|
run_directive(&mut result, &mut directive)?;
|
||||||
@ -12,8 +12,8 @@ pub(super) fn run<'input>(
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn run_directive<'input>(
|
fn run_directive<'input>(
|
||||||
result: &mut Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>,
|
result: &mut Vec<Directive2<'input, ptx_parser::Instruction<SpirvWord>, SpirvWord>>,
|
||||||
directive: &mut Directive2<ast::Instruction<SpirvWord>, SpirvWord>,
|
directive: &mut Directive2<'input, ptx_parser::Instruction<SpirvWord>, SpirvWord>,
|
||||||
) -> Result<(), TranslateError> {
|
) -> Result<(), TranslateError> {
|
||||||
match directive {
|
match directive {
|
||||||
Directive2::Variable(..) => {}
|
Directive2::Variable(..) => {}
|
||||||
@ -23,8 +23,8 @@ fn run_directive<'input>(
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn run_function<'input>(
|
fn run_function<'input>(
|
||||||
result: &mut Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>,
|
result: &mut Vec<Directive2<'input, ptx_parser::Instruction<SpirvWord>, SpirvWord>>,
|
||||||
function: &mut Function2<ast::Instruction<SpirvWord>, SpirvWord>,
|
function: &mut Function2<'input, ptx_parser::Instruction<SpirvWord>, SpirvWord>,
|
||||||
) {
|
) {
|
||||||
function.body = function.body.take().map(|statements| {
|
function.body = function.body.take().map(|statements| {
|
||||||
statements
|
statements
|
||||||
|
@ -11,8 +11,8 @@ use super::*;
|
|||||||
// pass, so we do nothing there
|
// pass, so we do nothing there
|
||||||
pub(super) fn run<'a, 'input>(
|
pub(super) fn run<'a, 'input>(
|
||||||
resolver: &mut GlobalStringIdentResolver2<'input>,
|
resolver: &mut GlobalStringIdentResolver2<'input>,
|
||||||
directives: Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>,
|
directives: Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>,
|
||||||
) -> Result<Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
|
) -> Result<Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
|
||||||
directives
|
directives
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|directive| run_directive(resolver, directive))
|
.map(|directive| run_directive(resolver, directive))
|
||||||
@ -21,8 +21,8 @@ pub(super) fn run<'a, 'input>(
|
|||||||
|
|
||||||
fn run_directive<'a, 'input>(
|
fn run_directive<'a, 'input>(
|
||||||
resolver: &mut GlobalStringIdentResolver2<'input>,
|
resolver: &mut GlobalStringIdentResolver2<'input>,
|
||||||
directive: Directive2<ast::Instruction<SpirvWord>, SpirvWord>,
|
directive: Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>,
|
||||||
) -> Result<Directive2<ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
|
) -> Result<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
|
||||||
Ok(match directive {
|
Ok(match directive {
|
||||||
var @ Directive2::Variable(..) => var,
|
var @ Directive2::Variable(..) => var,
|
||||||
Directive2::Method(method) => {
|
Directive2::Method(method) => {
|
||||||
@ -34,11 +34,12 @@ fn run_directive<'a, 'input>(
|
|||||||
|
|
||||||
fn run_method<'a, 'input>(
|
fn run_method<'a, 'input>(
|
||||||
mut visitor: InsertMemSSAVisitor<'a, 'input>,
|
mut visitor: InsertMemSSAVisitor<'a, 'input>,
|
||||||
mut method: Function2<ast::Instruction<SpirvWord>, SpirvWord>,
|
method: Function2<'input, ast::Instruction<SpirvWord>, SpirvWord>,
|
||||||
) -> Result<Function2<ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
|
) -> Result<Function2<'input, ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
|
||||||
let is_kernel = method.is_kernel;
|
let mut func_decl = method.func_decl;
|
||||||
|
let is_kernel = func_decl.name.is_kernel();
|
||||||
if is_kernel {
|
if is_kernel {
|
||||||
for arg in method.input_arguments.iter_mut() {
|
for arg in func_decl.input_arguments.iter_mut() {
|
||||||
let old_name = arg.name;
|
let old_name = arg.name;
|
||||||
let old_space = arg.state_space;
|
let old_space = arg.state_space;
|
||||||
let new_space = ast::StateSpace::ParamEntry;
|
let new_space = ast::StateSpace::ParamEntry;
|
||||||
@ -50,10 +51,10 @@ fn run_method<'a, 'input>(
|
|||||||
arg.state_space = new_space;
|
arg.state_space = new_space;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
for arg in method.return_arguments.iter_mut() {
|
for arg in func_decl.return_arguments.iter_mut() {
|
||||||
visitor.visit_variable(arg)?;
|
visitor.visit_variable(arg)?;
|
||||||
}
|
}
|
||||||
let return_arguments = &method.return_arguments[..];
|
let return_arguments = &func_decl.return_arguments[..];
|
||||||
let body = method
|
let body = method
|
||||||
.body
|
.body
|
||||||
.map(move |statements| {
|
.map(move |statements| {
|
||||||
@ -64,7 +65,14 @@ fn run_method<'a, 'input>(
|
|||||||
Ok::<_, TranslateError>(result)
|
Ok::<_, TranslateError>(result)
|
||||||
})
|
})
|
||||||
.transpose()?;
|
.transpose()?;
|
||||||
Ok(Function2 { body, ..method })
|
Ok(Function2 {
|
||||||
|
func_decl: func_decl,
|
||||||
|
globals: method.globals,
|
||||||
|
body,
|
||||||
|
import_as: method.import_as,
|
||||||
|
tuning: method.tuning,
|
||||||
|
linkage: method.linkage,
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
fn run_statement<'a, 'input>(
|
fn run_statement<'a, 'input>(
|
||||||
|
@ -19,8 +19,8 @@ use ptx_parser as ast;
|
|||||||
*/
|
*/
|
||||||
pub(super) fn run<'input>(
|
pub(super) fn run<'input>(
|
||||||
resolver: &mut GlobalStringIdentResolver2<'input>,
|
resolver: &mut GlobalStringIdentResolver2<'input>,
|
||||||
directives: Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>,
|
directives: Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>,
|
||||||
) -> Result<Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
|
) -> Result<Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
|
||||||
directives
|
directives
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|directive| run_directive(resolver, directive))
|
.map(|directive| run_directive(resolver, directive))
|
||||||
@ -29,8 +29,8 @@ pub(super) fn run<'input>(
|
|||||||
|
|
||||||
fn run_directive<'a, 'input>(
|
fn run_directive<'a, 'input>(
|
||||||
resolver: &mut GlobalStringIdentResolver2<'input>,
|
resolver: &mut GlobalStringIdentResolver2<'input>,
|
||||||
directive: Directive2<ast::Instruction<SpirvWord>, SpirvWord>,
|
directive: Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>,
|
||||||
) -> Result<Directive2<ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
|
) -> Result<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
|
||||||
Ok(match directive {
|
Ok(match directive {
|
||||||
var @ Directive2::Variable(..) => var,
|
var @ Directive2::Variable(..) => var,
|
||||||
Directive2::Method(mut method) => {
|
Directive2::Method(mut method) => {
|
||||||
|
@ -1,29 +0,0 @@
|
|||||||
.version 6.5
|
|
||||||
.target sm_50
|
|
||||||
.address_size 64
|
|
||||||
|
|
||||||
.func use_modes();
|
|
||||||
|
|
||||||
.visible .entry kernel()
|
|
||||||
{
|
|
||||||
.reg .f32 temp;
|
|
||||||
|
|
||||||
add.rz.ftz.f32 temp, temp, temp;
|
|
||||||
call use_modes;
|
|
||||||
add.rp.ftz.f32 temp, temp, temp;
|
|
||||||
ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
.func use_modes()
|
|
||||||
{
|
|
||||||
.reg .f32 temp;
|
|
||||||
.reg .pred pred;
|
|
||||||
@pred bra SET_RM;
|
|
||||||
@!pred bra SET_RZ;
|
|
||||||
SET_RM:
|
|
||||||
add.rm.f32 temp, temp, temp;
|
|
||||||
ret;
|
|
||||||
SET_RZ:
|
|
||||||
add.rz.f32 temp, temp, temp;
|
|
||||||
ret;
|
|
||||||
}
|
|
@ -1,15 +0,0 @@
|
|||||||
.version 6.5
|
|
||||||
.target sm_30
|
|
||||||
.address_size 64
|
|
||||||
|
|
||||||
.visible .entry add()
|
|
||||||
{
|
|
||||||
.reg .f32 temp<3>;
|
|
||||||
|
|
||||||
add.ftz.f16 temp2, temp1, temp0;
|
|
||||||
add.ftz.f32 temp2, temp1, temp0;
|
|
||||||
|
|
||||||
add.f16 temp2, temp1, temp0;
|
|
||||||
add.f32 temp2, temp1, temp0;
|
|
||||||
ret;
|
|
||||||
}
|
|
File diff suppressed because it is too large
Load Diff
@ -1,399 +0,0 @@
|
|||||||
use super::*;
|
|
||||||
use int_enum::IntEnum;
|
|
||||||
use strum::EnumCount;
|
|
||||||
|
|
||||||
#[repr(usize)]
|
|
||||||
#[derive(IntEnum, Eq, PartialEq, Copy, Clone, Debug)]
|
|
||||||
enum Bool {
|
|
||||||
False = 0,
|
|
||||||
True = 1,
|
|
||||||
}
|
|
||||||
|
|
||||||
fn ftz() -> InstructionModes {
|
|
||||||
InstructionModes {
|
|
||||||
denormal_f32: Some(DenormalMode::FlushToZero),
|
|
||||||
denormal_f16f64: None,
|
|
||||||
rounding_f32: None,
|
|
||||||
rounding_f16f64: None,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn preserve() -> InstructionModes {
|
|
||||||
InstructionModes {
|
|
||||||
denormal_f32: Some(DenormalMode::Preserve),
|
|
||||||
denormal_f16f64: None,
|
|
||||||
rounding_f32: None,
|
|
||||||
rounding_f16f64: None,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn transitive_mixed() {
|
|
||||||
let mut graph = ControlFlowGraph::new();
|
|
||||||
let entry_id = SpirvWord(1);
|
|
||||||
let false_id = SpirvWord(2);
|
|
||||||
let empty_id = SpirvWord(3);
|
|
||||||
let false2_id = SpirvWord(4);
|
|
||||||
let entry = graph.add_entry_basic_block(entry_id);
|
|
||||||
graph.add_jump(entry, false_id);
|
|
||||||
let false_ = graph.get_or_add_basic_block(false_id);
|
|
||||||
graph.set_modes(false_, ftz(), ftz());
|
|
||||||
graph.add_jump(false_, empty_id);
|
|
||||||
let empty = graph.get_or_add_basic_block(empty_id);
|
|
||||||
graph.add_jump(empty, false2_id);
|
|
||||||
let false2_ = graph.get_or_add_basic_block(false2_id);
|
|
||||||
graph.set_modes(false2_, ftz(), ftz());
|
|
||||||
let partial_result = super::compute_single_mode_insertions(&graph, |node| node.denormal_f32);
|
|
||||||
assert_eq!(partial_result.bb_must_insert_mode.len(), 0);
|
|
||||||
assert_eq!(partial_result.bb_maybe_insert_mode.len(), 1);
|
|
||||||
assert_eq!(
|
|
||||||
partial_result.bb_maybe_insert_mode[&false_id],
|
|
||||||
(DenormalMode::FlushToZero, iter::once(entry_id).collect())
|
|
||||||
);
|
|
||||||
|
|
||||||
let result = optimize_mode_insertions::<DenormalMode, { DenormalMode::COUNT }>(partial_result);
|
|
||||||
assert_eq!(result.basic_blocks.len(), 0);
|
|
||||||
assert_eq!(result.kernels.len(), 1);
|
|
||||||
assert_eq!(result.kernels[&entry_id], DenormalMode::FlushToZero);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn transitive_change_twice() {
|
|
||||||
let mut graph = ControlFlowGraph::new();
|
|
||||||
let entry_id = SpirvWord(1);
|
|
||||||
let false_id = SpirvWord(2);
|
|
||||||
let empty_id = SpirvWord(3);
|
|
||||||
let true_id = SpirvWord(4);
|
|
||||||
let entry = graph.add_entry_basic_block(entry_id);
|
|
||||||
graph.add_jump(entry, false_id);
|
|
||||||
let false_ = graph.get_or_add_basic_block(false_id);
|
|
||||||
graph.set_modes(false_, ftz(), ftz());
|
|
||||||
graph.add_jump(false_, empty_id);
|
|
||||||
let empty = graph.get_or_add_basic_block(empty_id);
|
|
||||||
graph.add_jump(empty, true_id);
|
|
||||||
let true_ = graph.get_or_add_basic_block(true_id);
|
|
||||||
graph.set_modes(true_, preserve(), preserve());
|
|
||||||
let partial_result = super::compute_single_mode_insertions(&graph, |node| node.denormal_f32);
|
|
||||||
assert_eq!(partial_result.bb_must_insert_mode.len(), 1);
|
|
||||||
assert!(partial_result.bb_must_insert_mode.contains(&true_id));
|
|
||||||
assert_eq!(partial_result.bb_maybe_insert_mode.len(), 1);
|
|
||||||
assert_eq!(
|
|
||||||
partial_result.bb_maybe_insert_mode[&false_id],
|
|
||||||
(DenormalMode::FlushToZero, iter::once(entry_id).collect())
|
|
||||||
);
|
|
||||||
|
|
||||||
let result = optimize_mode_insertions::<DenormalMode, { DenormalMode::COUNT }>(partial_result);
|
|
||||||
assert_eq!(result.basic_blocks, iter::once(true_id).collect());
|
|
||||||
assert_eq!(result.kernels.len(), 1);
|
|
||||||
assert_eq!(result.kernels[&entry_id], DenormalMode::FlushToZero);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn transitive_change() {
|
|
||||||
let mut graph = ControlFlowGraph::new();
|
|
||||||
let entry_id = SpirvWord(1);
|
|
||||||
let empty_id = SpirvWord(2);
|
|
||||||
let true_id = SpirvWord(3);
|
|
||||||
let entry = graph.add_entry_basic_block(entry_id);
|
|
||||||
graph.add_jump(entry, empty_id);
|
|
||||||
let empty = graph.get_or_add_basic_block(empty_id);
|
|
||||||
graph.add_jump(empty, true_id);
|
|
||||||
let true_ = graph.get_or_add_basic_block(true_id);
|
|
||||||
graph.set_modes(true_, preserve(), preserve());
|
|
||||||
let partial_result = super::compute_single_mode_insertions(&graph, |node| node.denormal_f32);
|
|
||||||
assert_eq!(partial_result.bb_must_insert_mode.len(), 0);
|
|
||||||
assert_eq!(partial_result.bb_maybe_insert_mode.len(), 1);
|
|
||||||
assert_eq!(
|
|
||||||
partial_result.bb_maybe_insert_mode[&true_id],
|
|
||||||
(DenormalMode::Preserve, iter::once(entry_id).collect())
|
|
||||||
);
|
|
||||||
|
|
||||||
let result = optimize_mode_insertions::<DenormalMode, { DenormalMode::COUNT }>(partial_result);
|
|
||||||
assert_eq!(result.basic_blocks.len(), 0);
|
|
||||||
assert_eq!(result.kernels.len(), 1);
|
|
||||||
assert_eq!(result.kernels[&entry_id], DenormalMode::Preserve);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn codependency() {
|
|
||||||
let mut graph = ControlFlowGraph::new();
|
|
||||||
let entry_id = SpirvWord(1);
|
|
||||||
let left_f_id = SpirvWord(2);
|
|
||||||
let right_f_id = SpirvWord(3);
|
|
||||||
let left_none_id = SpirvWord(4);
|
|
||||||
let mid_none_id = SpirvWord(5);
|
|
||||||
let right_none_id = SpirvWord(6);
|
|
||||||
let entry = graph.add_entry_basic_block(entry_id);
|
|
||||||
graph.add_jump(entry, left_f_id);
|
|
||||||
graph.add_jump(entry, right_f_id);
|
|
||||||
let left_f = graph.get_or_add_basic_block(left_f_id);
|
|
||||||
graph.set_modes(left_f, ftz(), ftz());
|
|
||||||
let right_f = graph.get_or_add_basic_block(right_f_id);
|
|
||||||
graph.set_modes(right_f, ftz(), ftz());
|
|
||||||
graph.add_jump(left_f, left_none_id);
|
|
||||||
let left_none = graph.get_or_add_basic_block(left_none_id);
|
|
||||||
graph.add_jump(right_f, right_none_id);
|
|
||||||
let right_none = graph.get_or_add_basic_block(right_none_id);
|
|
||||||
graph.add_jump(left_none, mid_none_id);
|
|
||||||
graph.add_jump(right_none, mid_none_id);
|
|
||||||
let mid_none = graph.get_or_add_basic_block(mid_none_id);
|
|
||||||
graph.add_jump(mid_none, left_none_id);
|
|
||||||
graph.add_jump(mid_none, right_none_id);
|
|
||||||
//println!(
|
|
||||||
// "{:?}",
|
|
||||||
// petgraph::dot::Dot::with_config(&graph.graph, &[petgraph::dot::Config::EdgeNoLabel])
|
|
||||||
//);
|
|
||||||
let partial_result = super::compute_single_mode_insertions(&graph, |node| node.denormal_f32);
|
|
||||||
assert_eq!(partial_result.bb_must_insert_mode.len(), 0);
|
|
||||||
assert_eq!(partial_result.bb_maybe_insert_mode.len(), 2);
|
|
||||||
assert_eq!(
|
|
||||||
partial_result.bb_maybe_insert_mode[&left_f_id],
|
|
||||||
(DenormalMode::FlushToZero, iter::once(entry_id).collect())
|
|
||||||
);
|
|
||||||
assert_eq!(
|
|
||||||
partial_result.bb_maybe_insert_mode[&right_f_id],
|
|
||||||
(DenormalMode::FlushToZero, iter::once(entry_id).collect())
|
|
||||||
);
|
|
||||||
|
|
||||||
let result = optimize_mode_insertions::<DenormalMode, { DenormalMode::COUNT }>(partial_result);
|
|
||||||
assert_eq!(result.basic_blocks.len(), 0);
|
|
||||||
assert_eq!(result.kernels.len(), 1);
|
|
||||||
assert_eq!(result.kernels[&entry_id], DenormalMode::FlushToZero);
|
|
||||||
}
|
|
||||||
|
|
||||||
static FOLD_DENORMAL_PTX: &'static str = include_str!("fold_denormal.ptx");
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn fold_denormal() {
|
|
||||||
let method = compile_methods(FOLD_DENORMAL_PTX).pop().unwrap();
|
|
||||||
assert_eq!(true, method.flush_to_zero_f32);
|
|
||||||
assert_eq!(true, method.flush_to_zero_f16f64);
|
|
||||||
let method_body = method.body.unwrap();
|
|
||||||
assert!(matches!(
|
|
||||||
&*method_body,
|
|
||||||
[
|
|
||||||
Statement::Label(..),
|
|
||||||
Statement::Variable(..),
|
|
||||||
Statement::Variable(..),
|
|
||||||
Statement::Variable(..),
|
|
||||||
Statement::Instruction(ast::Instruction::Add { .. }),
|
|
||||||
Statement::Instruction(ast::Instruction::Add { .. }),
|
|
||||||
Statement::SetMode(ModeRegister::Denormal {
|
|
||||||
f32: false,
|
|
||||||
f16f64: false
|
|
||||||
}),
|
|
||||||
Statement::Instruction(ast::Instruction::Add { .. }),
|
|
||||||
Statement::Instruction(ast::Instruction::Add { .. }),
|
|
||||||
Statement::Instruction(ast::Instruction::Ret { .. }),
|
|
||||||
]
|
|
||||||
));
|
|
||||||
}
|
|
||||||
|
|
||||||
fn compile_methods(ptx: &str) -> Vec<Function2<ast::Instruction<SpirvWord>, SpirvWord>> {
|
|
||||||
use crate::pass::*;
|
|
||||||
|
|
||||||
let module = ptx_parser::parse_module_checked(ptx).unwrap();
|
|
||||||
let mut flat_resolver = GlobalStringIdentResolver2::new(SpirvWord(1));
|
|
||||||
let mut scoped_resolver = ScopedResolver::new(&mut flat_resolver);
|
|
||||||
let directives = normalize_identifiers2::run(&mut scoped_resolver, module.directives).unwrap();
|
|
||||||
let directives = normalize_predicates2::run(&mut flat_resolver, directives).unwrap();
|
|
||||||
let directives = expand_operands::run(&mut flat_resolver, directives).unwrap();
|
|
||||||
let directives = normalize_basic_blocks::run(&mut flat_resolver, directives).unwrap();
|
|
||||||
let directives = super::run(&mut flat_resolver, directives).unwrap();
|
|
||||||
directives
|
|
||||||
.into_iter()
|
|
||||||
.filter_map(|s| match s {
|
|
||||||
Directive2::Method(m) => Some(m),
|
|
||||||
_ => None,
|
|
||||||
})
|
|
||||||
.collect::<Vec<_>>()
|
|
||||||
}
|
|
||||||
|
|
||||||
static CALL_WITH_MODE_PTX: &'static str = include_str!("call_with_mode.ptx");
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn call_with_mode() {
|
|
||||||
let methods = compile_methods(CALL_WITH_MODE_PTX);
|
|
||||||
|
|
||||||
assert!(matches!(methods[0].body, None));
|
|
||||||
|
|
||||||
let method_1 = methods[1].body.as_ref().unwrap();
|
|
||||||
assert!(matches!(
|
|
||||||
&**method_1,
|
|
||||||
[
|
|
||||||
Statement::Label(..),
|
|
||||||
Statement::Variable(..),
|
|
||||||
Statement::Instruction(ast::Instruction::Add { .. }),
|
|
||||||
Statement::Instruction(ast::Instruction::Call { .. }),
|
|
||||||
Statement::Instruction(ast::Instruction::Bra { .. }),
|
|
||||||
Statement::Label(..),
|
|
||||||
// Dual prelude
|
|
||||||
Statement::SetMode(ModeRegister::Denormal {
|
|
||||||
f32: true,
|
|
||||||
f16f64: true
|
|
||||||
}),
|
|
||||||
Statement::SetMode(ModeRegister::Rounding {
|
|
||||||
f32: ast::RoundingMode::PositiveInf,
|
|
||||||
f16f64: ast::RoundingMode::NearestEven
|
|
||||||
}),
|
|
||||||
Statement::Instruction(ast::Instruction::Bra { .. }),
|
|
||||||
// Denormal prelude
|
|
||||||
Statement::Label(..),
|
|
||||||
Statement::SetMode(ModeRegister::Denormal {
|
|
||||||
f32: true,
|
|
||||||
f16f64: true
|
|
||||||
}),
|
|
||||||
Statement::Instruction(ast::Instruction::Bra { .. }),
|
|
||||||
// Rounding prelude
|
|
||||||
Statement::Label(..),
|
|
||||||
Statement::SetMode(ModeRegister::Rounding {
|
|
||||||
f32: ast::RoundingMode::PositiveInf,
|
|
||||||
f16f64: ast::RoundingMode::NearestEven
|
|
||||||
}),
|
|
||||||
Statement::Instruction(ast::Instruction::Bra { .. }),
|
|
||||||
Statement::Label(..),
|
|
||||||
Statement::Instruction(ast::Instruction::Add { .. }),
|
|
||||||
Statement::Instruction(ast::Instruction::Ret { .. }),
|
|
||||||
]
|
|
||||||
));
|
|
||||||
let [to_fn0] = calls(method_1);
|
|
||||||
let [_, dual_prelude, _, _, add] = labels(method_1);
|
|
||||||
let [post_call, post_prelude_dual, post_prelude_denormal, post_prelude_rounding] =
|
|
||||||
branches(method_1);
|
|
||||||
assert_eq!(methods[0].name, to_fn0);
|
|
||||||
assert_eq!(post_call, dual_prelude);
|
|
||||||
assert_eq!(post_prelude_dual, add);
|
|
||||||
assert_eq!(post_prelude_denormal, add);
|
|
||||||
assert_eq!(post_prelude_rounding, add);
|
|
||||||
|
|
||||||
let method_2 = methods[2].body.as_ref().unwrap();
|
|
||||||
assert!(matches!(
|
|
||||||
&**method_2,
|
|
||||||
[
|
|
||||||
Statement::Label(..),
|
|
||||||
Statement::Variable(..),
|
|
||||||
Statement::Variable(..),
|
|
||||||
Statement::Conditional(..),
|
|
||||||
Statement::Label(..),
|
|
||||||
Statement::Conditional(..),
|
|
||||||
Statement::Label(..),
|
|
||||||
Statement::Instruction(ast::Instruction::Bra { .. }),
|
|
||||||
Statement::Label(..),
|
|
||||||
// Dual prelude
|
|
||||||
Statement::SetMode(ModeRegister::Denormal {
|
|
||||||
f32: false,
|
|
||||||
f16f64: true
|
|
||||||
}),
|
|
||||||
Statement::SetMode(ModeRegister::Rounding {
|
|
||||||
f32: ast::RoundingMode::NegativeInf,
|
|
||||||
f16f64: ast::RoundingMode::NearestEven
|
|
||||||
}),
|
|
||||||
Statement::Instruction(ast::Instruction::Bra { .. }),
|
|
||||||
// Denormal prelude
|
|
||||||
Statement::Label(..),
|
|
||||||
Statement::SetMode(ModeRegister::Denormal {
|
|
||||||
f32: false,
|
|
||||||
f16f64: true
|
|
||||||
}),
|
|
||||||
Statement::Instruction(ast::Instruction::Bra { .. }),
|
|
||||||
// Rounding prelude
|
|
||||||
Statement::Label(..),
|
|
||||||
Statement::SetMode(ModeRegister::Rounding {
|
|
||||||
f32: ast::RoundingMode::NegativeInf,
|
|
||||||
f16f64: ast::RoundingMode::NearestEven
|
|
||||||
}),
|
|
||||||
Statement::Instruction(ast::Instruction::Bra { .. }),
|
|
||||||
Statement::Label(..),
|
|
||||||
Statement::Instruction(ast::Instruction::Add { .. }),
|
|
||||||
Statement::Instruction(ast::Instruction::Bra { .. }),
|
|
||||||
Statement::Label(..),
|
|
||||||
Statement::SetMode(ModeRegister::Denormal {
|
|
||||||
f32: false,
|
|
||||||
f16f64: true
|
|
||||||
}),
|
|
||||||
Statement::Instruction(ast::Instruction::Bra { .. }),
|
|
||||||
Statement::Label(..),
|
|
||||||
Statement::Instruction(ast::Instruction::Add { .. }),
|
|
||||||
Statement::Instruction(ast::Instruction::Bra { .. }),
|
|
||||||
Statement::Label(..),
|
|
||||||
Statement::Instruction(ast::Instruction::Ret { .. }),
|
|
||||||
]
|
|
||||||
));
|
|
||||||
let [(if_rm_true, if_rm_false), (if_rz_true, if_rz_false)] = conditionals(method_2);
|
|
||||||
let [_, conditional2, post_conditional2, prelude_dual, _, _, add1, add2_set_denormal, add2, ret] =
|
|
||||||
labels(method_2);
|
|
||||||
let [post_conditional2_jump, post_prelude_dual, post_prelude_denormal, post_prelude_rounding, post_add1, post_add2_set_denormal, post_add2] =
|
|
||||||
branches(method_2);
|
|
||||||
assert_eq!(if_rm_true, prelude_dual);
|
|
||||||
assert_eq!(if_rm_false, conditional2);
|
|
||||||
assert_eq!(if_rz_true, post_conditional2);
|
|
||||||
assert_eq!(if_rz_false, add2_set_denormal);
|
|
||||||
assert_eq!(post_conditional2_jump, prelude_dual);
|
|
||||||
assert_eq!(post_prelude_dual, add1);
|
|
||||||
assert_eq!(post_prelude_denormal, add1);
|
|
||||||
assert_eq!(post_prelude_rounding, add1);
|
|
||||||
assert_eq!(post_add1, ret);
|
|
||||||
assert_eq!(post_add2_set_denormal, add2);
|
|
||||||
assert_eq!(post_add2, ret);
|
|
||||||
}
|
|
||||||
|
|
||||||
fn branches<const N: usize>(
|
|
||||||
fn_: &Vec<Statement<ast::Instruction<SpirvWord>, SpirvWord>>,
|
|
||||||
) -> [SpirvWord; N] {
|
|
||||||
fn_.iter()
|
|
||||||
.filter_map(|s| match s {
|
|
||||||
Statement::Instruction(ast::Instruction::Bra {
|
|
||||||
arguments: ast::BraArgs { src },
|
|
||||||
}) => Some(*src),
|
|
||||||
_ => None,
|
|
||||||
})
|
|
||||||
.collect::<Vec<_>>()
|
|
||||||
.try_into()
|
|
||||||
.unwrap()
|
|
||||||
}
|
|
||||||
|
|
||||||
fn labels<const N: usize>(
|
|
||||||
fn_: &Vec<Statement<ast::Instruction<SpirvWord>, SpirvWord>>,
|
|
||||||
) -> [SpirvWord; N] {
|
|
||||||
fn_.iter()
|
|
||||||
.filter_map(
|
|
||||||
|s: &Statement<ptx_parser::Instruction<SpirvWord>, SpirvWord>| match s {
|
|
||||||
Statement::Label(label) => Some(*label),
|
|
||||||
_ => None,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
.collect::<Vec<_>>()
|
|
||||||
.try_into()
|
|
||||||
.unwrap()
|
|
||||||
}
|
|
||||||
|
|
||||||
fn calls<const N: usize>(
|
|
||||||
fn_: &Vec<Statement<ast::Instruction<SpirvWord>, SpirvWord>>,
|
|
||||||
) -> [SpirvWord; N] {
|
|
||||||
fn_.iter()
|
|
||||||
.filter_map(|s| match s {
|
|
||||||
Statement::Instruction(ast::Instruction::Call {
|
|
||||||
arguments: ast::CallArgs { func, .. },
|
|
||||||
..
|
|
||||||
}) => Some(*func),
|
|
||||||
_ => None,
|
|
||||||
})
|
|
||||||
.collect::<Vec<_>>()
|
|
||||||
.try_into()
|
|
||||||
.unwrap()
|
|
||||||
}
|
|
||||||
|
|
||||||
fn conditionals<const N: usize>(
|
|
||||||
fn_: &Vec<Statement<ast::Instruction<SpirvWord>, SpirvWord>>,
|
|
||||||
) -> [(SpirvWord, SpirvWord); N] {
|
|
||||||
fn_.iter()
|
|
||||||
.filter_map(|s| match s {
|
|
||||||
Statement::Conditional(BrachCondition {
|
|
||||||
if_true, if_false, ..
|
|
||||||
}) => Some((*if_true, *if_false)),
|
|
||||||
_ => None,
|
|
||||||
})
|
|
||||||
.collect::<Vec<_>>()
|
|
||||||
.try_into()
|
|
||||||
.unwrap()
|
|
||||||
}
|
|
@ -17,15 +17,12 @@ mod expand_operands;
|
|||||||
mod fix_special_registers2;
|
mod fix_special_registers2;
|
||||||
mod hoist_globals;
|
mod hoist_globals;
|
||||||
mod insert_explicit_load_store;
|
mod insert_explicit_load_store;
|
||||||
mod instruction_mode_to_global_mode;
|
|
||||||
mod insert_implicit_conversions2;
|
mod insert_implicit_conversions2;
|
||||||
mod normalize_basic_blocks;
|
|
||||||
mod normalize_identifiers2;
|
mod normalize_identifiers2;
|
||||||
mod normalize_predicates2;
|
mod normalize_predicates2;
|
||||||
mod remove_unreachable_basic_blocks;
|
|
||||||
mod replace_instructions_with_function_calls;
|
mod replace_instructions_with_function_calls;
|
||||||
mod replace_known_functions;
|
|
||||||
mod resolve_function_pointers;
|
mod resolve_function_pointers;
|
||||||
|
mod replace_known_functions;
|
||||||
|
|
||||||
static ZLUDA_PTX_IMPL: &'static [u8] = include_bytes!("../../lib/zluda_ptx_impl.bc");
|
static ZLUDA_PTX_IMPL: &'static [u8] = include_bytes!("../../lib/zluda_ptx_impl.bc");
|
||||||
const ZLUDA_PTX_PREFIX: &'static str = "__zluda_ptx_impl_";
|
const ZLUDA_PTX_PREFIX: &'static str = "__zluda_ptx_impl_";
|
||||||
@ -46,15 +43,12 @@ pub fn to_llvm_module<'input>(ast: ast::Module<'input>) -> Result<Module, Transl
|
|||||||
let mut scoped_resolver = ScopedResolver::new(&mut flat_resolver);
|
let mut scoped_resolver = ScopedResolver::new(&mut flat_resolver);
|
||||||
let sreg_map = SpecialRegistersMap2::new(&mut scoped_resolver)?;
|
let sreg_map = SpecialRegistersMap2::new(&mut scoped_resolver)?;
|
||||||
let directives = normalize_identifiers2::run(&mut scoped_resolver, ast.directives)?;
|
let directives = normalize_identifiers2::run(&mut scoped_resolver, ast.directives)?;
|
||||||
let directives = replace_known_functions::run(&mut flat_resolver, directives);
|
let directives = replace_known_functions::run(&flat_resolver, directives);
|
||||||
let directives = normalize_predicates2::run(&mut flat_resolver, directives)?;
|
let directives = normalize_predicates2::run(&mut flat_resolver, directives)?;
|
||||||
let directives = resolve_function_pointers::run(directives)?;
|
let directives = resolve_function_pointers::run(directives)?;
|
||||||
let directives = fix_special_registers2::run(&mut flat_resolver, &sreg_map, directives)?;
|
let directives: Vec<Directive2<'_, ptx_parser::Instruction<ptx_parser::ParsedOperand<SpirvWord>>, ptx_parser::ParsedOperand<SpirvWord>>> = fix_special_registers2::run(&mut flat_resolver, &sreg_map, directives)?;
|
||||||
let directives = expand_operands::run(&mut flat_resolver, directives)?;
|
let directives = expand_operands::run(&mut flat_resolver, directives)?;
|
||||||
let directives = deparamize_functions::run(&mut flat_resolver, directives)?;
|
let directives = deparamize_functions::run(&mut flat_resolver, directives)?;
|
||||||
let directives = normalize_basic_blocks::run(&mut flat_resolver, directives)?;
|
|
||||||
let directives = remove_unreachable_basic_blocks::run(directives)?;
|
|
||||||
let directives = instruction_mode_to_global_mode::run(&mut flat_resolver, directives)?;
|
|
||||||
let directives = insert_explicit_load_store::run(&mut flat_resolver, directives)?;
|
let directives = insert_explicit_load_store::run(&mut flat_resolver, directives)?;
|
||||||
let directives = insert_implicit_conversions2::run(&mut flat_resolver, directives)?;
|
let directives = insert_implicit_conversions2::run(&mut flat_resolver, directives)?;
|
||||||
let directives = replace_instructions_with_function_calls::run(&mut flat_resolver, directives)?;
|
let directives = replace_instructions_with_function_calls::run(&mut flat_resolver, directives)?;
|
||||||
@ -67,7 +61,7 @@ pub fn to_llvm_module<'input>(ast: ast::Module<'input>) -> Result<Module, Transl
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub struct Module {
|
pub struct Module {
|
||||||
pub llvm_ir: emit_llvm::Module,
|
pub llvm_ir: emit_llvm::MemoryBuffer,
|
||||||
pub kernel_info: HashMap<String, KernelInfo>,
|
pub kernel_info: HashMap<String, KernelInfo>,
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -201,20 +195,6 @@ enum Statement<I, P: ast::Operand> {
|
|||||||
FunctionPointer(FunctionPointerDetails),
|
FunctionPointer(FunctionPointerDetails),
|
||||||
VectorRead(VectorRead),
|
VectorRead(VectorRead),
|
||||||
VectorWrite(VectorWrite),
|
VectorWrite(VectorWrite),
|
||||||
SetMode(ModeRegister),
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Eq, PartialEq, Clone, Copy)]
|
|
||||||
#[cfg_attr(test, derive(Debug))]
|
|
||||||
enum ModeRegister {
|
|
||||||
Denormal {
|
|
||||||
f32: bool,
|
|
||||||
f16f64: bool,
|
|
||||||
},
|
|
||||||
Rounding {
|
|
||||||
f32: ast::RoundingMode,
|
|
||||||
f16f64: ast::RoundingMode,
|
|
||||||
},
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<T: ast::Operand<Ident = SpirvWord>> Statement<ast::Instruction<T>, T> {
|
impl<T: ast::Operand<Ident = SpirvWord>> Statement<ast::Instruction<T>, T> {
|
||||||
@ -487,7 +467,6 @@ impl<T: ast::Operand<Ident = SpirvWord>> Statement<ast::Instruction<T>, T> {
|
|||||||
let src = visitor.visit_ident(src, None, false, false)?;
|
let src = visitor.visit_ident(src, None, false, false)?;
|
||||||
Statement::FunctionPointer(FunctionPointerDetails { dst, src })
|
Statement::FunctionPointer(FunctionPointerDetails { dst, src })
|
||||||
}
|
}
|
||||||
Statement::SetMode(mode_register) => Statement::SetMode(mode_register),
|
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -546,7 +525,7 @@ struct FunctionPointerDetails {
|
|||||||
src: SpirvWord,
|
src: SpirvWord,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Copy, Clone, PartialEq, Eq, Hash, PartialOrd, Ord, Debug)]
|
#[derive(Copy, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
|
||||||
pub struct SpirvWord(u32);
|
pub struct SpirvWord(u32);
|
||||||
|
|
||||||
impl From<u32> for SpirvWord {
|
impl From<u32> for SpirvWord {
|
||||||
@ -578,27 +557,22 @@ type NormalizedStatement = Statement<
|
|||||||
ast::ParsedOperand<SpirvWord>,
|
ast::ParsedOperand<SpirvWord>,
|
||||||
>;
|
>;
|
||||||
|
|
||||||
enum Directive2<Instruction, Operand: ast::Operand> {
|
enum Directive2<'input, Instruction, Operand: ast::Operand> {
|
||||||
Variable(ast::LinkingDirective, ast::Variable<SpirvWord>),
|
Variable(ast::LinkingDirective, ast::Variable<SpirvWord>),
|
||||||
Method(Function2<Instruction, Operand>),
|
Method(Function2<'input, Instruction, Operand>),
|
||||||
}
|
}
|
||||||
|
|
||||||
struct Function2<Instruction, Operand: ast::Operand> {
|
struct Function2<'input, Instruction, Operand: ast::Operand> {
|
||||||
pub return_arguments: Vec<ast::Variable<Operand::Ident>>,
|
pub func_decl: ast::MethodDeclaration<'input, SpirvWord>,
|
||||||
pub name: Operand::Ident,
|
pub globals: Vec<ast::Variable<SpirvWord>>,
|
||||||
pub input_arguments: Vec<ast::Variable<Operand::Ident>>,
|
|
||||||
pub body: Option<Vec<Statement<Instruction, Operand>>>,
|
pub body: Option<Vec<Statement<Instruction, Operand>>>,
|
||||||
is_kernel: bool,
|
|
||||||
import_as: Option<String>,
|
import_as: Option<String>,
|
||||||
tuning: Vec<ast::TuningDirective>,
|
tuning: Vec<ast::TuningDirective>,
|
||||||
linkage: ast::LinkingDirective,
|
linkage: ast::LinkingDirective,
|
||||||
flush_to_zero_f32: bool,
|
|
||||||
flush_to_zero_f16f64: bool,
|
|
||||||
rounding_mode_f32: ast::RoundingMode,
|
|
||||||
rounding_mode_f16f64: ast::RoundingMode,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
type NormalizedDirective2 = Directive2<
|
type NormalizedDirective2<'input> = Directive2<
|
||||||
|
'input,
|
||||||
(
|
(
|
||||||
Option<ast::PredAt<SpirvWord>>,
|
Option<ast::PredAt<SpirvWord>>,
|
||||||
ast::Instruction<ast::ParsedOperand<SpirvWord>>,
|
ast::Instruction<ast::ParsedOperand<SpirvWord>>,
|
||||||
@ -606,7 +580,8 @@ type NormalizedDirective2 = Directive2<
|
|||||||
ast::ParsedOperand<SpirvWord>,
|
ast::ParsedOperand<SpirvWord>,
|
||||||
>;
|
>;
|
||||||
|
|
||||||
type NormalizedFunction2 = Function2<
|
type NormalizedFunction2<'input> = Function2<
|
||||||
|
'input,
|
||||||
(
|
(
|
||||||
Option<ast::PredAt<SpirvWord>>,
|
Option<ast::PredAt<SpirvWord>>,
|
||||||
ast::Instruction<ast::ParsedOperand<SpirvWord>>,
|
ast::Instruction<ast::ParsedOperand<SpirvWord>>,
|
||||||
@ -614,11 +589,17 @@ type NormalizedFunction2 = Function2<
|
|||||||
ast::ParsedOperand<SpirvWord>,
|
ast::ParsedOperand<SpirvWord>,
|
||||||
>;
|
>;
|
||||||
|
|
||||||
type UnconditionalDirective =
|
type UnconditionalDirective<'input> = Directive2<
|
||||||
Directive2<ast::Instruction<ast::ParsedOperand<SpirvWord>>, ast::ParsedOperand<SpirvWord>>;
|
'input,
|
||||||
|
ast::Instruction<ast::ParsedOperand<SpirvWord>>,
|
||||||
|
ast::ParsedOperand<SpirvWord>,
|
||||||
|
>;
|
||||||
|
|
||||||
type UnconditionalFunction =
|
type UnconditionalFunction<'input> = Function2<
|
||||||
Function2<ast::Instruction<ast::ParsedOperand<SpirvWord>>, ast::ParsedOperand<SpirvWord>>;
|
'input,
|
||||||
|
ast::Instruction<ast::ParsedOperand<SpirvWord>>,
|
||||||
|
ast::ParsedOperand<SpirvWord>,
|
||||||
|
>;
|
||||||
|
|
||||||
struct GlobalStringIdentResolver2<'input> {
|
struct GlobalStringIdentResolver2<'input> {
|
||||||
pub(crate) current_id: SpirvWord,
|
pub(crate) current_id: SpirvWord,
|
||||||
@ -824,45 +805,47 @@ impl SpecialRegistersMap2 {
|
|||||||
self.id_to_reg.get(&id).copied()
|
self.id_to_reg.get(&id).copied()
|
||||||
}
|
}
|
||||||
|
|
||||||
fn len() -> usize {
|
fn generate_declarations<'a, 'input>(
|
||||||
PtxSpecialRegister::iter().len()
|
|
||||||
}
|
|
||||||
|
|
||||||
fn foreach_declaration<'a, 'input>(
|
|
||||||
resolver: &'a mut GlobalStringIdentResolver2<'input>,
|
resolver: &'a mut GlobalStringIdentResolver2<'input>,
|
||||||
mut fn_: impl FnMut(
|
) -> impl ExactSizeIterator<
|
||||||
|
Item = (
|
||||||
PtxSpecialRegister,
|
PtxSpecialRegister,
|
||||||
(
|
ast::MethodDeclaration<'input, SpirvWord>,
|
||||||
Vec<ast::Variable<SpirvWord>>,
|
|
||||||
SpirvWord,
|
|
||||||
Vec<ast::Variable<SpirvWord>>,
|
|
||||||
),
|
),
|
||||||
),
|
> + 'a {
|
||||||
) {
|
PtxSpecialRegister::iter().map(|sreg| {
|
||||||
for sreg in PtxSpecialRegister::iter() {
|
|
||||||
let external_fn_name = [ZLUDA_PTX_PREFIX, sreg.get_unprefixed_function_name()].concat();
|
let external_fn_name = [ZLUDA_PTX_PREFIX, sreg.get_unprefixed_function_name()].concat();
|
||||||
let name = resolver.register_named(Cow::Owned(external_fn_name), None);
|
let name =
|
||||||
|
ast::MethodName::Func(resolver.register_named(Cow::Owned(external_fn_name), None));
|
||||||
let return_type = sreg.get_function_return_type();
|
let return_type = sreg.get_function_return_type();
|
||||||
let input_type = sreg.get_function_input_type();
|
let input_type = sreg.get_function_input_type();
|
||||||
let return_arguments = vec![ast::Variable {
|
(
|
||||||
|
sreg,
|
||||||
|
ast::MethodDeclaration {
|
||||||
|
return_arguments: vec![ast::Variable {
|
||||||
align: None,
|
align: None,
|
||||||
v_type: return_type.into(),
|
v_type: return_type.into(),
|
||||||
state_space: ast::StateSpace::Reg,
|
state_space: ast::StateSpace::Reg,
|
||||||
name: resolver.register_unnamed(Some((return_type.into(), ast::StateSpace::Reg))),
|
name: resolver
|
||||||
|
.register_unnamed(Some((return_type.into(), ast::StateSpace::Reg))),
|
||||||
array_init: Vec::new(),
|
array_init: Vec::new(),
|
||||||
}];
|
}],
|
||||||
let input_arguments = input_type
|
name: name,
|
||||||
|
input_arguments: input_type
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|type_| ast::Variable {
|
.map(|type_| ast::Variable {
|
||||||
align: None,
|
align: None,
|
||||||
v_type: type_.into(),
|
v_type: type_.into(),
|
||||||
state_space: ast::StateSpace::Reg,
|
state_space: ast::StateSpace::Reg,
|
||||||
name: resolver.register_unnamed(Some((type_.into(), ast::StateSpace::Reg))),
|
name: resolver
|
||||||
|
.register_unnamed(Some((type_.into(), ast::StateSpace::Reg))),
|
||||||
array_init: Vec::new(),
|
array_init: Vec::new(),
|
||||||
})
|
})
|
||||||
.collect::<Vec<_>>();
|
.collect::<Vec<_>>(),
|
||||||
fn_(sreg, (return_arguments, name, input_arguments));
|
shared_mem: None,
|
||||||
}
|
},
|
||||||
|
)
|
||||||
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,134 +0,0 @@
|
|||||||
use super::*;
|
|
||||||
|
|
||||||
// This pass normalizes ptx modules in two ways that makes mode computation pass
|
|
||||||
// and code emissions passes much simpler:
|
|
||||||
// * Inserts label at the start of every function
|
|
||||||
// This makes control flow graph simpler in mode computation block: we can
|
|
||||||
// represent kernels as separate nodes with its own separate entry/exit mode
|
|
||||||
// * Inserts label at the start of every basic block
|
|
||||||
// * Insert explicit jumps before labels
|
|
||||||
// * Non-.entry methods get a single `ret;` exit point - this is because mode computation
|
|
||||||
// logic requires it. Control flow graph constructed by mode computation
|
|
||||||
// models function calls as jumps into and then from another function.
|
|
||||||
// If this cfg allowed multiple return basic blocks then there would be cases
|
|
||||||
// where we want to insert mode setting instruction along the edge between
|
|
||||||
// `ret;` and bb in the caller. This is only possible if there's a single
|
|
||||||
// edge between from function `ret;` and caller
|
|
||||||
pub(crate) fn run(
|
|
||||||
flat_resolver: &mut GlobalStringIdentResolver2<'_>,
|
|
||||||
mut directives: Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>,
|
|
||||||
) -> Result<Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
|
|
||||||
for directive in directives.iter_mut() {
|
|
||||||
let (body_ref, is_kernel) = match directive {
|
|
||||||
Directive2::Method(Function2 {
|
|
||||||
body: Some(body), is_kernel, ..
|
|
||||||
}) => (body, *is_kernel),
|
|
||||||
_ => continue,
|
|
||||||
};
|
|
||||||
let body = std::mem::replace(body_ref, Vec::new());
|
|
||||||
let mut result = Vec::with_capacity(body.len());
|
|
||||||
let mut previous_instruction_was_terminator = TerminatorKind::Not;
|
|
||||||
let mut body_iterator = body.into_iter();
|
|
||||||
let mut return_statements = Vec::new();
|
|
||||||
match body_iterator.next() {
|
|
||||||
Some(Statement::Label(_)) => {}
|
|
||||||
Some(statement) => {
|
|
||||||
result.push(Statement::Label(flat_resolver.register_unnamed(None)));
|
|
||||||
result.push(statement);
|
|
||||||
}
|
|
||||||
None => {}
|
|
||||||
}
|
|
||||||
for statement in body_iterator {
|
|
||||||
match previous_instruction_was_terminator {
|
|
||||||
TerminatorKind::Not => match statement {
|
|
||||||
Statement::Label(label) => {
|
|
||||||
result.push(Statement::Instruction(ast::Instruction::Bra {
|
|
||||||
arguments: ast::BraArgs { src: label },
|
|
||||||
}))
|
|
||||||
}
|
|
||||||
_ => {}
|
|
||||||
},
|
|
||||||
TerminatorKind::Real => {
|
|
||||||
if !matches!(statement, Statement::Label(..)) {
|
|
||||||
result.push(Statement::Label(flat_resolver.register_unnamed(None)));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
TerminatorKind::Fake => match statement {
|
|
||||||
// If there's a label after a call just reuse it
|
|
||||||
Statement::Label(label) => {
|
|
||||||
result.push(Statement::Instruction(ast::Instruction::Bra {
|
|
||||||
arguments: ast::BraArgs { src: label },
|
|
||||||
}))
|
|
||||||
}
|
|
||||||
_ => {
|
|
||||||
let label = flat_resolver.register_unnamed(None);
|
|
||||||
result.push(Statement::Instruction(ast::Instruction::Bra {
|
|
||||||
arguments: ast::BraArgs { src: label },
|
|
||||||
}));
|
|
||||||
result.push(Statement::Label(label));
|
|
||||||
}
|
|
||||||
},
|
|
||||||
}
|
|
||||||
match statement {
|
|
||||||
Statement::RetValue(..) => {
|
|
||||||
return Err(error_unreachable());
|
|
||||||
}
|
|
||||||
Statement::Instruction(ast::Instruction::Ret { .. }) => {
|
|
||||||
if !is_kernel {
|
|
||||||
return_statements.push(result.len());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
_ => {}
|
|
||||||
}
|
|
||||||
previous_instruction_was_terminator = is_block_terminator(&statement);
|
|
||||||
result.push(statement);
|
|
||||||
}
|
|
||||||
convert_from_multiple_returns_to_single_return(
|
|
||||||
flat_resolver,
|
|
||||||
&mut result,
|
|
||||||
return_statements,
|
|
||||||
)?;
|
|
||||||
*body_ref = result;
|
|
||||||
}
|
|
||||||
Ok(directives)
|
|
||||||
}
|
|
||||||
|
|
||||||
enum TerminatorKind {
|
|
||||||
Not,
|
|
||||||
Real,
|
|
||||||
Fake,
|
|
||||||
}
|
|
||||||
|
|
||||||
fn convert_from_multiple_returns_to_single_return(
|
|
||||||
flat_resolver: &mut GlobalStringIdentResolver2<'_>,
|
|
||||||
result: &mut Vec<Statement<ptx_parser::Instruction<SpirvWord>, SpirvWord>>,
|
|
||||||
return_statements: Vec<usize>,
|
|
||||||
) -> Result<(), TranslateError> {
|
|
||||||
Ok(if return_statements.len() > 1 {
|
|
||||||
let ret_bb = flat_resolver.register_unnamed(None);
|
|
||||||
result.push(Statement::Label(ret_bb));
|
|
||||||
result.push(Statement::Instruction(ast::Instruction::Ret {
|
|
||||||
data: ast::RetData { uniform: false },
|
|
||||||
}));
|
|
||||||
for ret_index in return_statements {
|
|
||||||
let statement = result.get_mut(ret_index).ok_or_else(error_unreachable)?;
|
|
||||||
*statement = Statement::Instruction(ast::Instruction::Bra {
|
|
||||||
arguments: ast::BraArgs { src: ret_bb },
|
|
||||||
});
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
fn is_block_terminator(
|
|
||||||
statement: &Statement<ast::Instruction<SpirvWord>, SpirvWord>,
|
|
||||||
) -> TerminatorKind {
|
|
||||||
match statement {
|
|
||||||
Statement::Conditional(..)
|
|
||||||
| Statement::Instruction(ast::Instruction::Bra { .. })
|
|
||||||
// Normally call is not a terminator, but we treat it as such because it
|
|
||||||
// makes the "instruction modes to global modes" pass possible
|
|
||||||
| Statement::Instruction(ast::Instruction::Ret { .. }) => TerminatorKind::Real,
|
|
||||||
Statement::Instruction(ast::Instruction::Call { .. }) => TerminatorKind::Fake,
|
|
||||||
_ => TerminatorKind::Not,
|
|
||||||
}
|
|
||||||
}
|
|
@ -4,7 +4,7 @@ use ptx_parser as ast;
|
|||||||
pub(crate) fn run<'input, 'b>(
|
pub(crate) fn run<'input, 'b>(
|
||||||
resolver: &mut ScopedResolver<'input, 'b>,
|
resolver: &mut ScopedResolver<'input, 'b>,
|
||||||
directives: Vec<ast::Directive<'input, ast::ParsedOperand<&'input str>>>,
|
directives: Vec<ast::Directive<'input, ast::ParsedOperand<&'input str>>>,
|
||||||
) -> Result<Vec<NormalizedDirective2>, TranslateError> {
|
) -> Result<Vec<NormalizedDirective2<'input>>, TranslateError> {
|
||||||
resolver.start_scope();
|
resolver.start_scope();
|
||||||
let result = directives
|
let result = directives
|
||||||
.into_iter()
|
.into_iter()
|
||||||
@ -17,7 +17,7 @@ pub(crate) fn run<'input, 'b>(
|
|||||||
fn run_directive<'input, 'b>(
|
fn run_directive<'input, 'b>(
|
||||||
resolver: &mut ScopedResolver<'input, 'b>,
|
resolver: &mut ScopedResolver<'input, 'b>,
|
||||||
directive: ast::Directive<'input, ast::ParsedOperand<&'input str>>,
|
directive: ast::Directive<'input, ast::ParsedOperand<&'input str>>,
|
||||||
) -> Result<NormalizedDirective2, TranslateError> {
|
) -> Result<NormalizedDirective2<'input>, TranslateError> {
|
||||||
Ok(match directive {
|
Ok(match directive {
|
||||||
ast::Directive::Variable(linking, var) => {
|
ast::Directive::Variable(linking, var) => {
|
||||||
NormalizedDirective2::Variable(linking, run_variable(resolver, var)?)
|
NormalizedDirective2::Variable(linking, run_variable(resolver, var)?)
|
||||||
@ -32,11 +32,15 @@ fn run_method<'input, 'b>(
|
|||||||
resolver: &mut ScopedResolver<'input, 'b>,
|
resolver: &mut ScopedResolver<'input, 'b>,
|
||||||
linkage: ast::LinkingDirective,
|
linkage: ast::LinkingDirective,
|
||||||
method: ast::Function<'input, &'input str, ast::Statement<ast::ParsedOperand<&'input str>>>,
|
method: ast::Function<'input, &'input str, ast::Statement<ast::ParsedOperand<&'input str>>>,
|
||||||
) -> Result<NormalizedFunction2, TranslateError> {
|
) -> Result<NormalizedFunction2<'input>, TranslateError> {
|
||||||
let is_kernel = method.func_directive.name.is_kernel();
|
let name = match method.func_directive.name {
|
||||||
let name = resolver.add_or_get_in_current_scope_untyped(method.func_directive.name.text())?;
|
ast::MethodName::Kernel(name) => ast::MethodName::Kernel(name),
|
||||||
|
ast::MethodName::Func(text) => {
|
||||||
|
ast::MethodName::Func(resolver.add_or_get_in_current_scope_untyped(text)?)
|
||||||
|
}
|
||||||
|
};
|
||||||
resolver.start_scope();
|
resolver.start_scope();
|
||||||
let (return_arguments, input_arguments) = run_function_decl(resolver, method.func_directive)?;
|
let func_decl = run_function_decl(resolver, method.func_directive, name)?;
|
||||||
let body = method
|
let body = method
|
||||||
.body
|
.body
|
||||||
.map(|statements| {
|
.map(|statements| {
|
||||||
@ -47,25 +51,20 @@ fn run_method<'input, 'b>(
|
|||||||
.transpose()?;
|
.transpose()?;
|
||||||
resolver.end_scope();
|
resolver.end_scope();
|
||||||
Ok(Function2 {
|
Ok(Function2 {
|
||||||
return_arguments,
|
func_decl,
|
||||||
name,
|
globals: Vec::new(),
|
||||||
input_arguments,
|
|
||||||
body,
|
body,
|
||||||
import_as: None,
|
import_as: None,
|
||||||
linkage,
|
|
||||||
is_kernel,
|
|
||||||
tuning: method.tuning,
|
tuning: method.tuning,
|
||||||
flush_to_zero_f32: false,
|
linkage,
|
||||||
flush_to_zero_f16f64: false,
|
|
||||||
rounding_mode_f32: ptx_parser::RoundingMode::NearestEven,
|
|
||||||
rounding_mode_f16f64: ptx_parser::RoundingMode::NearestEven,
|
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
fn run_function_decl<'input, 'b>(
|
fn run_function_decl<'input, 'b>(
|
||||||
resolver: &mut ScopedResolver<'input, 'b>,
|
resolver: &mut ScopedResolver<'input, 'b>,
|
||||||
func_directive: ast::MethodDeclaration<'input, &'input str>,
|
func_directive: ast::MethodDeclaration<'input, &'input str>,
|
||||||
) -> Result<(Vec<ast::Variable<SpirvWord>>, Vec<ast::Variable<SpirvWord>>), TranslateError> {
|
name: ast::MethodName<'input, SpirvWord>,
|
||||||
|
) -> Result<ast::MethodDeclaration<'input, SpirvWord>, TranslateError> {
|
||||||
assert!(func_directive.shared_mem.is_none());
|
assert!(func_directive.shared_mem.is_none());
|
||||||
let return_arguments = func_directive
|
let return_arguments = func_directive
|
||||||
.return_arguments
|
.return_arguments
|
||||||
@ -77,7 +76,12 @@ fn run_function_decl<'input, 'b>(
|
|||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|var| run_variable(resolver, var))
|
.map(|var| run_variable(resolver, var))
|
||||||
.collect::<Result<Vec<_>, _>>()?;
|
.collect::<Result<Vec<_>, _>>()?;
|
||||||
Ok((return_arguments, input_arguments))
|
Ok(ast::MethodDeclaration {
|
||||||
|
return_arguments,
|
||||||
|
name,
|
||||||
|
input_arguments,
|
||||||
|
shared_mem: None,
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
fn run_variable<'input, 'b>(
|
fn run_variable<'input, 'b>(
|
||||||
|
@ -3,8 +3,8 @@ use ptx_parser as ast;
|
|||||||
|
|
||||||
pub(crate) fn run<'input>(
|
pub(crate) fn run<'input>(
|
||||||
resolver: &mut GlobalStringIdentResolver2<'input>,
|
resolver: &mut GlobalStringIdentResolver2<'input>,
|
||||||
directives: Vec<NormalizedDirective2>,
|
directives: Vec<NormalizedDirective2<'input>>,
|
||||||
) -> Result<Vec<UnconditionalDirective>, TranslateError> {
|
) -> Result<Vec<UnconditionalDirective<'input>>, TranslateError> {
|
||||||
directives
|
directives
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|directive| run_directive(resolver, directive))
|
.map(|directive| run_directive(resolver, directive))
|
||||||
@ -13,8 +13,8 @@ pub(crate) fn run<'input>(
|
|||||||
|
|
||||||
fn run_directive<'input>(
|
fn run_directive<'input>(
|
||||||
resolver: &mut GlobalStringIdentResolver2<'input>,
|
resolver: &mut GlobalStringIdentResolver2<'input>,
|
||||||
directive: NormalizedDirective2,
|
directive: NormalizedDirective2<'input>,
|
||||||
) -> Result<UnconditionalDirective, TranslateError> {
|
) -> Result<UnconditionalDirective<'input>, TranslateError> {
|
||||||
Ok(match directive {
|
Ok(match directive {
|
||||||
Directive2::Variable(linking, var) => Directive2::Variable(linking, var),
|
Directive2::Variable(linking, var) => Directive2::Variable(linking, var),
|
||||||
Directive2::Method(method) => Directive2::Method(run_method(resolver, method)?),
|
Directive2::Method(method) => Directive2::Method(run_method(resolver, method)?),
|
||||||
@ -23,8 +23,8 @@ fn run_directive<'input>(
|
|||||||
|
|
||||||
fn run_method<'input>(
|
fn run_method<'input>(
|
||||||
resolver: &mut GlobalStringIdentResolver2<'input>,
|
resolver: &mut GlobalStringIdentResolver2<'input>,
|
||||||
method: NormalizedFunction2,
|
method: NormalizedFunction2<'input>,
|
||||||
) -> Result<UnconditionalFunction, TranslateError> {
|
) -> Result<UnconditionalFunction<'input>, TranslateError> {
|
||||||
let body = method
|
let body = method
|
||||||
.body
|
.body
|
||||||
.map(|statements| {
|
.map(|statements| {
|
||||||
@ -36,18 +36,12 @@ fn run_method<'input>(
|
|||||||
})
|
})
|
||||||
.transpose()?;
|
.transpose()?;
|
||||||
Ok(Function2 {
|
Ok(Function2 {
|
||||||
|
func_decl: method.func_decl,
|
||||||
|
globals: method.globals,
|
||||||
body,
|
body,
|
||||||
return_arguments: method.return_arguments,
|
|
||||||
name: method.name,
|
|
||||||
input_arguments: method.input_arguments,
|
|
||||||
import_as: method.import_as,
|
import_as: method.import_as,
|
||||||
tuning: method.tuning,
|
tuning: method.tuning,
|
||||||
linkage: method.linkage,
|
linkage: method.linkage,
|
||||||
is_kernel: method.is_kernel,
|
|
||||||
flush_to_zero_f32: method.flush_to_zero_f32,
|
|
||||||
flush_to_zero_f16f64: method.flush_to_zero_f16f64,
|
|
||||||
rounding_mode_f32: method.rounding_mode_f32,
|
|
||||||
rounding_mode_f16f64: method.rounding_mode_f16f64,
|
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,122 +0,0 @@
|
|||||||
use super::*;
|
|
||||||
use petgraph::{
|
|
||||||
graph::NodeIndex,
|
|
||||||
visit::{Bfs, VisitMap},
|
|
||||||
Graph,
|
|
||||||
};
|
|
||||||
use rustc_hash::FxHashSet;
|
|
||||||
|
|
||||||
pub(crate) fn run(
|
|
||||||
mut directives: Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>,
|
|
||||||
) -> Result<Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
|
|
||||||
let mut reachable_funcs = FxHashSet::default();
|
|
||||||
for directive in directives.iter_mut() {
|
|
||||||
match directive {
|
|
||||||
Directive2::Method(Function2 {
|
|
||||||
body: Some(body), ..
|
|
||||||
}) => {
|
|
||||||
let old_body = std::mem::replace(body, Vec::new());
|
|
||||||
let mut cfg = ControlFlowGraph::new();
|
|
||||||
let mut old_body_iter = old_body.iter();
|
|
||||||
let mut current_bb = match old_body_iter.next() {
|
|
||||||
Some(Statement::Label(label)) => cfg.add_or_get_node(*label),
|
|
||||||
_ => return Err(error_unreachable()),
|
|
||||||
};
|
|
||||||
let first_bb = current_bb;
|
|
||||||
for statement in old_body_iter {
|
|
||||||
match statement {
|
|
||||||
Statement::Label(label) => {
|
|
||||||
current_bb = cfg.add_or_get_node(*label);
|
|
||||||
}
|
|
||||||
Statement::Conditional(branch) => {
|
|
||||||
cfg.add_branch(current_bb, branch.if_true);
|
|
||||||
cfg.add_branch(current_bb, branch.if_false);
|
|
||||||
}
|
|
||||||
Statement::Instruction(ast::Instruction::Bra {
|
|
||||||
arguments: ast::BraArgs { src },
|
|
||||||
}) => {
|
|
||||||
cfg.add_branch(current_bb, *src);
|
|
||||||
}
|
|
||||||
Statement::FunctionPointer(FunctionPointerDetails {
|
|
||||||
src: _func, ..
|
|
||||||
}) => {
|
|
||||||
return Err(error_todo());
|
|
||||||
}
|
|
||||||
Statement::Instruction(ast::Instruction::Call {
|
|
||||||
arguments: ast::CallArgs { func, .. },
|
|
||||||
..
|
|
||||||
}) => {
|
|
||||||
reachable_funcs.insert(*func);
|
|
||||||
}
|
|
||||||
_ => {}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
let mut bfs = Bfs::new(&cfg.graph, first_bb);
|
|
||||||
while let Some(_) = bfs.next(&cfg.graph) {}
|
|
||||||
let mut visited = true;
|
|
||||||
*body = try_filter_to_vec(old_body.into_iter(), |statement| {
|
|
||||||
match statement {
|
|
||||||
Statement::Label(label) => {
|
|
||||||
visited = bfs
|
|
||||||
.discovered
|
|
||||||
.is_visited(cfg.nodes.get(label).ok_or_else(error_unreachable)?);
|
|
||||||
}
|
|
||||||
_ => {}
|
|
||||||
}
|
|
||||||
Ok(visited)
|
|
||||||
})?;
|
|
||||||
}
|
|
||||||
_ => {}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Ok(directives
|
|
||||||
.into_iter()
|
|
||||||
.filter(|directive| match directive {
|
|
||||||
Directive2::Variable(..) => true,
|
|
||||||
Directive2::Method(Function2 {
|
|
||||||
name, is_kernel, ..
|
|
||||||
}) => *is_kernel || reachable_funcs.contains(name),
|
|
||||||
})
|
|
||||||
.collect::<Vec<_>>())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn try_filter_to_vec<T, E>(
|
|
||||||
mut iter: impl ExactSizeIterator<Item = T>,
|
|
||||||
mut filter: impl FnMut(&T) -> Result<bool, E>,
|
|
||||||
) -> Result<Vec<T>, E> {
|
|
||||||
iter.try_fold(Vec::with_capacity(iter.len()), |mut vec, item| {
|
|
||||||
match filter(&item) {
|
|
||||||
Ok(true) => vec.push(item),
|
|
||||||
Ok(false) => {}
|
|
||||||
Err(err) => return Err(err),
|
|
||||||
}
|
|
||||||
Ok(vec)
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
struct ControlFlowGraph {
|
|
||||||
graph: Graph<SpirvWord, ()>,
|
|
||||||
nodes: FxHashMap<SpirvWord, NodeIndex>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl ControlFlowGraph {
|
|
||||||
fn new() -> Self {
|
|
||||||
Self {
|
|
||||||
graph: Graph::new(),
|
|
||||||
nodes: FxHashMap::default(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn add_or_get_node(&mut self, id: SpirvWord) -> NodeIndex {
|
|
||||||
*self
|
|
||||||
.nodes
|
|
||||||
.entry(id)
|
|
||||||
.or_insert_with(|| self.graph.add_node(id))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn add_branch(&mut self, from: NodeIndex, to: SpirvWord) -> NodeIndex {
|
|
||||||
let to = self.add_or_get_node(to);
|
|
||||||
self.graph.add_edge(from, to, ());
|
|
||||||
to
|
|
||||||
}
|
|
||||||
}
|
|
@ -2,8 +2,8 @@ use super::*;
|
|||||||
|
|
||||||
pub(super) fn run<'input>(
|
pub(super) fn run<'input>(
|
||||||
resolver: &mut GlobalStringIdentResolver2<'input>,
|
resolver: &mut GlobalStringIdentResolver2<'input>,
|
||||||
directives: Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>,
|
directives: Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>,
|
||||||
) -> Result<Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
|
) -> Result<Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
|
||||||
let mut fn_declarations = FxHashMap::default();
|
let mut fn_declarations = FxHashMap::default();
|
||||||
let remapped_directives = directives
|
let remapped_directives = directives
|
||||||
.into_iter()
|
.into_iter()
|
||||||
@ -13,18 +13,17 @@ pub(super) fn run<'input>(
|
|||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|(_, (return_arguments, name, input_arguments))| {
|
.map(|(_, (return_arguments, name, input_arguments))| {
|
||||||
Directive2::Method(Function2 {
|
Directive2::Method(Function2 {
|
||||||
|
func_decl: ast::MethodDeclaration {
|
||||||
return_arguments,
|
return_arguments,
|
||||||
name: name,
|
name: ast::MethodName::Func(name),
|
||||||
input_arguments,
|
input_arguments,
|
||||||
|
shared_mem: None,
|
||||||
|
},
|
||||||
|
globals: Vec::new(),
|
||||||
body: None,
|
body: None,
|
||||||
import_as: None,
|
import_as: None,
|
||||||
tuning: Vec::new(),
|
tuning: Vec::new(),
|
||||||
linkage: ast::LinkingDirective::EXTERN,
|
linkage: ast::LinkingDirective::EXTERN,
|
||||||
is_kernel: false,
|
|
||||||
flush_to_zero_f32: false,
|
|
||||||
flush_to_zero_f16f64: false,
|
|
||||||
rounding_mode_f32: ptx_parser::RoundingMode::NearestEven,
|
|
||||||
rounding_mode_f16f64: ptx_parser::RoundingMode::NearestEven,
|
|
||||||
})
|
})
|
||||||
})
|
})
|
||||||
.collect::<Vec<_>>();
|
.collect::<Vec<_>>();
|
||||||
@ -42,8 +41,8 @@ fn run_directive<'input>(
|
|||||||
Vec<ast::Variable<SpirvWord>>,
|
Vec<ast::Variable<SpirvWord>>,
|
||||||
),
|
),
|
||||||
>,
|
>,
|
||||||
directive: Directive2<ast::Instruction<SpirvWord>, SpirvWord>,
|
directive: Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>,
|
||||||
) -> Result<Directive2<ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
|
) -> Result<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
|
||||||
Ok(match directive {
|
Ok(match directive {
|
||||||
var @ Directive2::Variable(..) => var,
|
var @ Directive2::Variable(..) => var,
|
||||||
Directive2::Method(mut method) => {
|
Directive2::Method(mut method) => {
|
||||||
|
@ -1,15 +1,14 @@
|
|||||||
use std::borrow::Cow;
|
|
||||||
|
|
||||||
use super::{GlobalStringIdentResolver2, NormalizedDirective2, SpirvWord};
|
use super::{GlobalStringIdentResolver2, NormalizedDirective2, SpirvWord};
|
||||||
|
|
||||||
pub(crate) fn run<'input>(
|
pub(crate) fn run<'input>(
|
||||||
resolver: &mut GlobalStringIdentResolver2<'input>,
|
resolver: &GlobalStringIdentResolver2<'input>,
|
||||||
mut directives: Vec<NormalizedDirective2>,
|
mut directives: Vec<NormalizedDirective2<'input>>,
|
||||||
) -> Vec<NormalizedDirective2> {
|
) -> Vec<NormalizedDirective2<'input>> {
|
||||||
for directive in directives.iter_mut() {
|
for directive in directives.iter_mut() {
|
||||||
match directive {
|
match directive {
|
||||||
NormalizedDirective2::Method(func) => {
|
NormalizedDirective2::Method(func) => {
|
||||||
replace_with_ptx_impl(resolver, func.name);
|
func.import_as =
|
||||||
|
replace_with_ptx_impl(resolver, &func.func_decl.name, func.import_as.take());
|
||||||
}
|
}
|
||||||
_ => {}
|
_ => {}
|
||||||
}
|
}
|
||||||
@ -18,16 +17,22 @@ pub(crate) fn run<'input>(
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn replace_with_ptx_impl<'input>(
|
fn replace_with_ptx_impl<'input>(
|
||||||
resolver: &mut GlobalStringIdentResolver2<'input>,
|
resolver: &GlobalStringIdentResolver2<'input>,
|
||||||
fn_name: SpirvWord,
|
fn_name: &ptx_parser::MethodName<'input, SpirvWord>,
|
||||||
) {
|
name: Option<String>,
|
||||||
|
) -> Option<String> {
|
||||||
let known_names = ["__assertfail"];
|
let known_names = ["__assertfail"];
|
||||||
if let Some(super::IdentEntry {
|
match name {
|
||||||
|
Some(name) if known_names.contains(&&*name) => Some(format!("__zluda_ptx_impl_{}", name)),
|
||||||
|
Some(name) => Some(name),
|
||||||
|
None => match fn_name {
|
||||||
|
ptx_parser::MethodName::Func(name) => match resolver.ident_map.get(name) {
|
||||||
|
Some(super::IdentEntry {
|
||||||
name: Some(name), ..
|
name: Some(name), ..
|
||||||
}) = resolver.ident_map.get_mut(&fn_name)
|
}) => Some(format!("__zluda_ptx_impl_{}", name)),
|
||||||
{
|
_ => None,
|
||||||
if known_names.contains(&&**name) {
|
},
|
||||||
*name = Cow::Owned(format!("__zluda_ptx_impl_{}", name));
|
ptx_parser::MethodName::Kernel(..) => None,
|
||||||
}
|
},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -3,8 +3,8 @@ use ptx_parser as ast;
|
|||||||
use rustc_hash::FxHashSet;
|
use rustc_hash::FxHashSet;
|
||||||
|
|
||||||
pub(crate) fn run<'input>(
|
pub(crate) fn run<'input>(
|
||||||
directives: Vec<UnconditionalDirective>,
|
directives: Vec<UnconditionalDirective<'input>>,
|
||||||
) -> Result<Vec<UnconditionalDirective>, TranslateError> {
|
) -> Result<Vec<UnconditionalDirective<'input>>, TranslateError> {
|
||||||
let mut functions = FxHashSet::default();
|
let mut functions = FxHashSet::default();
|
||||||
directives
|
directives
|
||||||
.into_iter()
|
.into_iter()
|
||||||
@ -14,13 +14,19 @@ pub(crate) fn run<'input>(
|
|||||||
|
|
||||||
fn run_directive<'input>(
|
fn run_directive<'input>(
|
||||||
functions: &mut FxHashSet<SpirvWord>,
|
functions: &mut FxHashSet<SpirvWord>,
|
||||||
directive: UnconditionalDirective,
|
directive: UnconditionalDirective<'input>,
|
||||||
) -> Result<UnconditionalDirective, TranslateError> {
|
) -> Result<UnconditionalDirective<'input>, TranslateError> {
|
||||||
Ok(match directive {
|
Ok(match directive {
|
||||||
var @ Directive2::Variable(..) => var,
|
var @ Directive2::Variable(..) => var,
|
||||||
Directive2::Method(method) => {
|
Directive2::Method(method) => {
|
||||||
if !method.is_kernel {
|
{
|
||||||
functions.insert(method.name);
|
let func_decl = &method.func_decl;
|
||||||
|
match func_decl.name {
|
||||||
|
ptx_parser::MethodName::Kernel(_) => {}
|
||||||
|
ptx_parser::MethodName::Func(name) => {
|
||||||
|
functions.insert(name);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
Directive2::Method(run_method(functions, method)?)
|
Directive2::Method(run_method(functions, method)?)
|
||||||
}
|
}
|
||||||
@ -29,8 +35,8 @@ fn run_directive<'input>(
|
|||||||
|
|
||||||
fn run_method<'input>(
|
fn run_method<'input>(
|
||||||
functions: &mut FxHashSet<SpirvWord>,
|
functions: &mut FxHashSet<SpirvWord>,
|
||||||
method: UnconditionalFunction,
|
method: UnconditionalFunction<'input>,
|
||||||
) -> Result<UnconditionalFunction, TranslateError> {
|
) -> Result<UnconditionalFunction<'input>, TranslateError> {
|
||||||
let body = method
|
let body = method
|
||||||
.body
|
.body
|
||||||
.map(|statements| {
|
.map(|statements| {
|
||||||
@ -40,7 +46,14 @@ fn run_method<'input>(
|
|||||||
.collect::<Result<Vec<_>, _>>()
|
.collect::<Result<Vec<_>, _>>()
|
||||||
})
|
})
|
||||||
.transpose()?;
|
.transpose()?;
|
||||||
Ok(Function2 { body, ..method })
|
Ok(Function2 {
|
||||||
|
func_decl: method.func_decl,
|
||||||
|
globals: method.globals,
|
||||||
|
body,
|
||||||
|
import_as: method.import_as,
|
||||||
|
tuning: method.tuning,
|
||||||
|
linkage: method.linkage,
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
fn run_statement<'input>(
|
fn run_statement<'input>(
|
||||||
|
@ -1,24 +0,0 @@
|
|||||||
declare i32 @__zluda_ptx_impl_activemask() #0
|
|
||||||
|
|
||||||
define amdgpu_kernel void @activemask(ptr addrspace(4) byref(i64) %"29", ptr addrspace(4) byref(i64) %"30") #1 {
|
|
||||||
%"31" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"32" = alloca i32, align 4, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"28"
|
|
||||||
|
|
||||||
"28": ; preds = %1
|
|
||||||
%"33" = load i64, ptr addrspace(4) %"30", align 4
|
|
||||||
store i64 %"33", ptr addrspace(5) %"31", align 4
|
|
||||||
%"34" = call i32 @__zluda_ptx_impl_activemask()
|
|
||||||
store i32 %"34", ptr addrspace(5) %"32", align 4
|
|
||||||
%"35" = load i64, ptr addrspace(5) %"31", align 4
|
|
||||||
%"36" = load i32, ptr addrspace(5) %"32", align 4
|
|
||||||
%"37" = inttoptr i64 %"35" to ptr
|
|
||||||
store i32 %"36", ptr %"37", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
||||||
attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
@ -1,30 +0,0 @@
|
|||||||
define amdgpu_kernel void @add(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) byref(i64) %"33") #0 {
|
|
||||||
%"34" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"35" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"36" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"37" = alloca i64, align 8, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"31"
|
|
||||||
|
|
||||||
"31": ; preds = %1
|
|
||||||
%"38" = load i64, ptr addrspace(4) %"32", align 4
|
|
||||||
store i64 %"38", ptr addrspace(5) %"34", align 4
|
|
||||||
%"39" = load i64, ptr addrspace(4) %"33", align 4
|
|
||||||
store i64 %"39", ptr addrspace(5) %"35", align 4
|
|
||||||
%"41" = load i64, ptr addrspace(5) %"34", align 4
|
|
||||||
%"46" = inttoptr i64 %"41" to ptr
|
|
||||||
%"40" = load i64, ptr %"46", align 4
|
|
||||||
store i64 %"40", ptr addrspace(5) %"36", align 4
|
|
||||||
%"43" = load i64, ptr addrspace(5) %"36", align 4
|
|
||||||
%"42" = add i64 %"43", 1
|
|
||||||
store i64 %"42", ptr addrspace(5) %"37", align 4
|
|
||||||
%"44" = load i64, ptr addrspace(5) %"35", align 4
|
|
||||||
%"45" = load i64, ptr addrspace(5) %"37", align 4
|
|
||||||
%"47" = inttoptr i64 %"44" to ptr
|
|
||||||
store i64 %"45", ptr %"47", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
@ -1,52 +0,0 @@
|
|||||||
define amdgpu_kernel void @add_ftz(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 {
|
|
||||||
%"39" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"40" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"41" = alloca float, align 4, addrspace(5)
|
|
||||||
%"42" = alloca float, align 4, addrspace(5)
|
|
||||||
%"43" = alloca float, align 4, addrspace(5)
|
|
||||||
%"44" = alloca float, align 4, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"36"
|
|
||||||
|
|
||||||
"36": ; preds = %1
|
|
||||||
%"45" = load i64, ptr addrspace(4) %"37", align 4
|
|
||||||
store i64 %"45", ptr addrspace(5) %"39", align 4
|
|
||||||
%"46" = load i64, ptr addrspace(4) %"38", align 4
|
|
||||||
store i64 %"46", ptr addrspace(5) %"40", align 4
|
|
||||||
%"48" = load i64, ptr addrspace(5) %"39", align 4
|
|
||||||
%"61" = inttoptr i64 %"48" to ptr
|
|
||||||
%"47" = load float, ptr %"61", align 4
|
|
||||||
store float %"47", ptr addrspace(5) %"41", align 4
|
|
||||||
%"49" = load i64, ptr addrspace(5) %"39", align 4
|
|
||||||
%"62" = inttoptr i64 %"49" to ptr
|
|
||||||
%"33" = getelementptr inbounds i8, ptr %"62", i64 4
|
|
||||||
%"50" = load float, ptr %"33", align 4
|
|
||||||
store float %"50", ptr addrspace(5) %"42", align 4
|
|
||||||
%"52" = load float, ptr addrspace(5) %"41", align 4
|
|
||||||
%"53" = load float, ptr addrspace(5) %"42", align 4
|
|
||||||
%"51" = fadd float %"52", %"53"
|
|
||||||
store float %"51", ptr addrspace(5) %"43", align 4
|
|
||||||
call void @llvm.amdgcn.s.setreg(i32 6401, i32 3)
|
|
||||||
%"55" = load float, ptr addrspace(5) %"41", align 4
|
|
||||||
%"56" = load float, ptr addrspace(5) %"42", align 4
|
|
||||||
%"54" = fadd float %"55", %"56"
|
|
||||||
store float %"54", ptr addrspace(5) %"44", align 4
|
|
||||||
%"57" = load i64, ptr addrspace(5) %"40", align 4
|
|
||||||
%"58" = load float, ptr addrspace(5) %"43", align 4
|
|
||||||
%"63" = inttoptr i64 %"57" to ptr
|
|
||||||
store float %"58", ptr %"63", align 4
|
|
||||||
%"59" = load i64, ptr addrspace(5) %"40", align 4
|
|
||||||
%"64" = inttoptr i64 %"59" to ptr
|
|
||||||
%"35" = getelementptr inbounds i8, ptr %"64", i64 4
|
|
||||||
%"60" = load float, ptr addrspace(5) %"44", align 4
|
|
||||||
store float %"60", ptr %"35", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
; Function Attrs: nocallback nofree nosync nounwind willreturn
|
|
||||||
declare void @llvm.amdgcn.s.setreg(i32 immarg, i32) #1
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
||||||
attributes #1 = { nocallback nofree nosync nounwind willreturn }
|
|
@ -1,30 +0,0 @@
|
|||||||
define amdgpu_kernel void @add_non_coherent(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) byref(i64) %"33") #0 {
|
|
||||||
%"34" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"35" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"36" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"37" = alloca i64, align 8, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"31"
|
|
||||||
|
|
||||||
"31": ; preds = %1
|
|
||||||
%"38" = load i64, ptr addrspace(4) %"32", align 4
|
|
||||||
store i64 %"38", ptr addrspace(5) %"34", align 4
|
|
||||||
%"39" = load i64, ptr addrspace(4) %"33", align 4
|
|
||||||
store i64 %"39", ptr addrspace(5) %"35", align 4
|
|
||||||
%"41" = load i64, ptr addrspace(5) %"34", align 4
|
|
||||||
%"46" = inttoptr i64 %"41" to ptr addrspace(1)
|
|
||||||
%"40" = load i64, ptr addrspace(1) %"46", align 4
|
|
||||||
store i64 %"40", ptr addrspace(5) %"36", align 4
|
|
||||||
%"43" = load i64, ptr addrspace(5) %"36", align 4
|
|
||||||
%"42" = add i64 %"43", 1
|
|
||||||
store i64 %"42", ptr addrspace(5) %"37", align 4
|
|
||||||
%"44" = load i64, ptr addrspace(5) %"35", align 4
|
|
||||||
%"45" = load i64, ptr addrspace(5) %"37", align 4
|
|
||||||
%"47" = inttoptr i64 %"44" to ptr addrspace(1)
|
|
||||||
store i64 %"45", ptr addrspace(1) %"47", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
@ -1,30 +0,0 @@
|
|||||||
define amdgpu_kernel void @add_tuning(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) byref(i64) %"33") #0 {
|
|
||||||
%"34" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"35" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"36" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"37" = alloca i64, align 8, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"31"
|
|
||||||
|
|
||||||
"31": ; preds = %1
|
|
||||||
%"38" = load i64, ptr addrspace(4) %"32", align 4
|
|
||||||
store i64 %"38", ptr addrspace(5) %"34", align 4
|
|
||||||
%"39" = load i64, ptr addrspace(4) %"33", align 4
|
|
||||||
store i64 %"39", ptr addrspace(5) %"35", align 4
|
|
||||||
%"41" = load i64, ptr addrspace(5) %"34", align 4
|
|
||||||
%"46" = inttoptr i64 %"41" to ptr
|
|
||||||
%"40" = load i64, ptr %"46", align 4
|
|
||||||
store i64 %"40", ptr addrspace(5) %"36", align 4
|
|
||||||
%"43" = load i64, ptr addrspace(5) %"36", align 4
|
|
||||||
%"42" = add i64 %"43", 1
|
|
||||||
store i64 %"42", ptr addrspace(5) %"37", align 4
|
|
||||||
%"44" = load i64, ptr addrspace(5) %"35", align 4
|
|
||||||
%"45" = load i64, ptr addrspace(5) %"37", align 4
|
|
||||||
%"47" = inttoptr i64 %"44" to ptr
|
|
||||||
store i64 %"45", ptr %"47", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
@ -1,36 +0,0 @@
|
|||||||
define amdgpu_kernel void @and(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 {
|
|
||||||
%"35" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"36" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"37" = alloca i32, align 4, addrspace(5)
|
|
||||||
%"38" = alloca i32, align 4, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"32"
|
|
||||||
|
|
||||||
"32": ; preds = %1
|
|
||||||
%"39" = load i64, ptr addrspace(4) %"33", align 4
|
|
||||||
store i64 %"39", ptr addrspace(5) %"35", align 4
|
|
||||||
%"40" = load i64, ptr addrspace(4) %"34", align 4
|
|
||||||
store i64 %"40", ptr addrspace(5) %"36", align 4
|
|
||||||
%"42" = load i64, ptr addrspace(5) %"35", align 4
|
|
||||||
%"50" = inttoptr i64 %"42" to ptr
|
|
||||||
%"41" = load i32, ptr %"50", align 4
|
|
||||||
store i32 %"41", ptr addrspace(5) %"37", align 4
|
|
||||||
%"43" = load i64, ptr addrspace(5) %"35", align 4
|
|
||||||
%"51" = inttoptr i64 %"43" to ptr
|
|
||||||
%"31" = getelementptr inbounds i8, ptr %"51", i64 4
|
|
||||||
%"44" = load i32, ptr %"31", align 4
|
|
||||||
store i32 %"44", ptr addrspace(5) %"38", align 4
|
|
||||||
%"46" = load i32, ptr addrspace(5) %"37", align 4
|
|
||||||
%"47" = load i32, ptr addrspace(5) %"38", align 4
|
|
||||||
%"52" = and i32 %"46", %"47"
|
|
||||||
store i32 %"52", ptr addrspace(5) %"37", align 4
|
|
||||||
%"48" = load i64, ptr addrspace(5) %"36", align 4
|
|
||||||
%"49" = load i32, ptr addrspace(5) %"37", align 4
|
|
||||||
%"55" = inttoptr i64 %"48" to ptr
|
|
||||||
store i32 %"49", ptr %"55", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
@ -1,46 +0,0 @@
|
|||||||
@shared_mem = external addrspace(3) global [1024 x i8], align 4
|
|
||||||
|
|
||||||
define amdgpu_kernel void @atom_add(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 {
|
|
||||||
%"38" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"39" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"40" = alloca i32, align 4, addrspace(5)
|
|
||||||
%"41" = alloca i32, align 4, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"35"
|
|
||||||
|
|
||||||
"35": ; preds = %1
|
|
||||||
%"42" = load i64, ptr addrspace(4) %"36", align 4
|
|
||||||
store i64 %"42", ptr addrspace(5) %"38", align 4
|
|
||||||
%"43" = load i64, ptr addrspace(4) %"37", align 4
|
|
||||||
store i64 %"43", ptr addrspace(5) %"39", align 4
|
|
||||||
%"45" = load i64, ptr addrspace(5) %"38", align 4
|
|
||||||
%"56" = inttoptr i64 %"45" to ptr
|
|
||||||
%"44" = load i32, ptr %"56", align 4
|
|
||||||
store i32 %"44", ptr addrspace(5) %"40", align 4
|
|
||||||
%"46" = load i64, ptr addrspace(5) %"38", align 4
|
|
||||||
%"57" = inttoptr i64 %"46" to ptr
|
|
||||||
%"32" = getelementptr inbounds i8, ptr %"57", i64 4
|
|
||||||
%"47" = load i32, ptr %"32", align 4
|
|
||||||
store i32 %"47", ptr addrspace(5) %"41", align 4
|
|
||||||
%"48" = load i32, ptr addrspace(5) %"40", align 4
|
|
||||||
store i32 %"48", ptr addrspace(3) @shared_mem, align 4
|
|
||||||
%"50" = load i32, ptr addrspace(5) %"41", align 4
|
|
||||||
%2 = atomicrmw add ptr addrspace(3) @shared_mem, i32 %"50" syncscope("agent-one-as") monotonic, align 4
|
|
||||||
store i32 %2, ptr addrspace(5) %"40", align 4
|
|
||||||
%"51" = load i32, ptr addrspace(3) @shared_mem, align 4
|
|
||||||
store i32 %"51", ptr addrspace(5) %"41", align 4
|
|
||||||
%"52" = load i64, ptr addrspace(5) %"39", align 4
|
|
||||||
%"53" = load i32, ptr addrspace(5) %"40", align 4
|
|
||||||
%"61" = inttoptr i64 %"52" to ptr
|
|
||||||
store i32 %"53", ptr %"61", align 4
|
|
||||||
%"54" = load i64, ptr addrspace(5) %"39", align 4
|
|
||||||
%"62" = inttoptr i64 %"54" to ptr
|
|
||||||
%"34" = getelementptr inbounds i8, ptr %"62", i64 4
|
|
||||||
%"55" = load i32, ptr addrspace(5) %"41", align 4
|
|
||||||
store i32 %"55", ptr %"34", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
@ -1,46 +0,0 @@
|
|||||||
@shared_mem = external addrspace(3) global [1024 x i8], align 4
|
|
||||||
|
|
||||||
define amdgpu_kernel void @atom_add_float(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 {
|
|
||||||
%"38" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"39" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"40" = alloca float, align 4, addrspace(5)
|
|
||||||
%"41" = alloca float, align 4, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"35"
|
|
||||||
|
|
||||||
"35": ; preds = %1
|
|
||||||
%"42" = load i64, ptr addrspace(4) %"36", align 4
|
|
||||||
store i64 %"42", ptr addrspace(5) %"38", align 4
|
|
||||||
%"43" = load i64, ptr addrspace(4) %"37", align 4
|
|
||||||
store i64 %"43", ptr addrspace(5) %"39", align 4
|
|
||||||
%"45" = load i64, ptr addrspace(5) %"38", align 4
|
|
||||||
%"56" = inttoptr i64 %"45" to ptr
|
|
||||||
%"44" = load float, ptr %"56", align 4
|
|
||||||
store float %"44", ptr addrspace(5) %"40", align 4
|
|
||||||
%"46" = load i64, ptr addrspace(5) %"38", align 4
|
|
||||||
%"57" = inttoptr i64 %"46" to ptr
|
|
||||||
%"32" = getelementptr inbounds i8, ptr %"57", i64 4
|
|
||||||
%"47" = load float, ptr %"32", align 4
|
|
||||||
store float %"47", ptr addrspace(5) %"41", align 4
|
|
||||||
%"48" = load float, ptr addrspace(5) %"40", align 4
|
|
||||||
store float %"48", ptr addrspace(3) @shared_mem, align 4
|
|
||||||
%"50" = load float, ptr addrspace(5) %"41", align 4
|
|
||||||
%2 = atomicrmw fadd ptr addrspace(3) @shared_mem, float %"50" syncscope("agent-one-as") monotonic, align 4
|
|
||||||
store float %2, ptr addrspace(5) %"40", align 4
|
|
||||||
%"51" = load float, ptr addrspace(3) @shared_mem, align 4
|
|
||||||
store float %"51", ptr addrspace(5) %"41", align 4
|
|
||||||
%"52" = load i64, ptr addrspace(5) %"39", align 4
|
|
||||||
%"53" = load float, ptr addrspace(5) %"40", align 4
|
|
||||||
%"61" = inttoptr i64 %"52" to ptr
|
|
||||||
store float %"53", ptr %"61", align 4
|
|
||||||
%"54" = load i64, ptr addrspace(5) %"39", align 4
|
|
||||||
%"62" = inttoptr i64 %"54" to ptr
|
|
||||||
%"34" = getelementptr inbounds i8, ptr %"62", i64 4
|
|
||||||
%"55" = load float, ptr addrspace(5) %"41", align 4
|
|
||||||
store float %"55", ptr %"34", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
@ -1,44 +0,0 @@
|
|||||||
define amdgpu_kernel void @atom_cas(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #0 {
|
|
||||||
%"40" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"41" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"42" = alloca i32, align 4, addrspace(5)
|
|
||||||
%"43" = alloca i32, align 4, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"37"
|
|
||||||
|
|
||||||
"37": ; preds = %1
|
|
||||||
%"44" = load i64, ptr addrspace(4) %"38", align 4
|
|
||||||
store i64 %"44", ptr addrspace(5) %"40", align 4
|
|
||||||
%"45" = load i64, ptr addrspace(4) %"39", align 4
|
|
||||||
store i64 %"45", ptr addrspace(5) %"41", align 4
|
|
||||||
%"47" = load i64, ptr addrspace(5) %"40", align 4
|
|
||||||
%"57" = inttoptr i64 %"47" to ptr
|
|
||||||
%"46" = load i32, ptr %"57", align 4
|
|
||||||
store i32 %"46", ptr addrspace(5) %"42", align 4
|
|
||||||
%"48" = load i64, ptr addrspace(5) %"40", align 4
|
|
||||||
%"58" = inttoptr i64 %"48" to ptr
|
|
||||||
%"31" = getelementptr inbounds i8, ptr %"58", i64 4
|
|
||||||
%"50" = load i32, ptr addrspace(5) %"42", align 4
|
|
||||||
%2 = cmpxchg ptr %"31", i32 %"50", i32 100 syncscope("agent-one-as") monotonic monotonic, align 4
|
|
||||||
%"59" = extractvalue { i32, i1 } %2, 0
|
|
||||||
store i32 %"59", ptr addrspace(5) %"42", align 4
|
|
||||||
%"51" = load i64, ptr addrspace(5) %"40", align 4
|
|
||||||
%"61" = inttoptr i64 %"51" to ptr
|
|
||||||
%"34" = getelementptr inbounds i8, ptr %"61", i64 4
|
|
||||||
%"52" = load i32, ptr %"34", align 4
|
|
||||||
store i32 %"52", ptr addrspace(5) %"43", align 4
|
|
||||||
%"53" = load i64, ptr addrspace(5) %"41", align 4
|
|
||||||
%"54" = load i32, ptr addrspace(5) %"42", align 4
|
|
||||||
%"62" = inttoptr i64 %"53" to ptr
|
|
||||||
store i32 %"54", ptr %"62", align 4
|
|
||||||
%"55" = load i64, ptr addrspace(5) %"41", align 4
|
|
||||||
%"63" = inttoptr i64 %"55" to ptr
|
|
||||||
%"36" = getelementptr inbounds i8, ptr %"63", i64 4
|
|
||||||
%"56" = load i32, ptr addrspace(5) %"43", align 4
|
|
||||||
store i32 %"56", ptr %"36", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
@ -1,46 +0,0 @@
|
|||||||
define amdgpu_kernel void @atom_inc(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #0 {
|
|
||||||
%"40" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"41" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"42" = alloca i32, align 4, addrspace(5)
|
|
||||||
%"43" = alloca i32, align 4, addrspace(5)
|
|
||||||
%"44" = alloca i32, align 4, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"37"
|
|
||||||
|
|
||||||
"37": ; preds = %1
|
|
||||||
%"45" = load i64, ptr addrspace(4) %"38", align 4
|
|
||||||
store i64 %"45", ptr addrspace(5) %"40", align 4
|
|
||||||
%"46" = load i64, ptr addrspace(4) %"39", align 4
|
|
||||||
store i64 %"46", ptr addrspace(5) %"41", align 4
|
|
||||||
%"48" = load i64, ptr addrspace(5) %"40", align 4
|
|
||||||
%"59" = inttoptr i64 %"48" to ptr
|
|
||||||
%2 = atomicrmw uinc_wrap ptr %"59", i32 101 syncscope("agent-one-as") monotonic, align 4
|
|
||||||
store i32 %2, ptr addrspace(5) %"42", align 4
|
|
||||||
%"50" = load i64, ptr addrspace(5) %"40", align 4
|
|
||||||
%"60" = inttoptr i64 %"50" to ptr addrspace(1)
|
|
||||||
%3 = atomicrmw uinc_wrap ptr addrspace(1) %"60", i32 101 syncscope("agent-one-as") monotonic, align 4
|
|
||||||
store i32 %3, ptr addrspace(5) %"43", align 4
|
|
||||||
%"52" = load i64, ptr addrspace(5) %"40", align 4
|
|
||||||
%"61" = inttoptr i64 %"52" to ptr
|
|
||||||
%"51" = load i32, ptr %"61", align 4
|
|
||||||
store i32 %"51", ptr addrspace(5) %"44", align 4
|
|
||||||
%"53" = load i64, ptr addrspace(5) %"41", align 4
|
|
||||||
%"54" = load i32, ptr addrspace(5) %"42", align 4
|
|
||||||
%"62" = inttoptr i64 %"53" to ptr
|
|
||||||
store i32 %"54", ptr %"62", align 4
|
|
||||||
%"55" = load i64, ptr addrspace(5) %"41", align 4
|
|
||||||
%"63" = inttoptr i64 %"55" to ptr
|
|
||||||
%"34" = getelementptr inbounds i8, ptr %"63", i64 4
|
|
||||||
%"56" = load i32, ptr addrspace(5) %"43", align 4
|
|
||||||
store i32 %"56", ptr %"34", align 4
|
|
||||||
%"57" = load i64, ptr addrspace(5) %"41", align 4
|
|
||||||
%"64" = inttoptr i64 %"57" to ptr
|
|
||||||
%"36" = getelementptr inbounds i8, ptr %"64", i64 8
|
|
||||||
%"58" = load i32, ptr addrspace(5) %"44", align 4
|
|
||||||
store i32 %"58", ptr %"36", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
@ -1,30 +0,0 @@
|
|||||||
define amdgpu_kernel void @b64tof64(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
|
|
||||||
%"33" = alloca double, align 8, addrspace(5)
|
|
||||||
%"34" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"35" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"36" = alloca i64, align 8, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"30"
|
|
||||||
|
|
||||||
"30": ; preds = %1
|
|
||||||
%"37" = load double, ptr addrspace(4) %"31", align 8
|
|
||||||
store double %"37", ptr addrspace(5) %"33", align 8
|
|
||||||
%"38" = load i64, ptr addrspace(4) %"32", align 4
|
|
||||||
store i64 %"38", ptr addrspace(5) %"35", align 4
|
|
||||||
%"40" = load double, ptr addrspace(5) %"33", align 8
|
|
||||||
%"46" = bitcast double %"40" to i64
|
|
||||||
store i64 %"46", ptr addrspace(5) %"34", align 4
|
|
||||||
%"42" = load i64, ptr addrspace(5) %"34", align 4
|
|
||||||
%"47" = inttoptr i64 %"42" to ptr
|
|
||||||
%"41" = load i64, ptr %"47", align 4
|
|
||||||
store i64 %"41", ptr addrspace(5) %"36", align 4
|
|
||||||
%"43" = load i64, ptr addrspace(5) %"35", align 4
|
|
||||||
%"44" = load i64, ptr addrspace(5) %"36", align 4
|
|
||||||
%"48" = inttoptr i64 %"43" to ptr
|
|
||||||
store i64 %"44", ptr %"48", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
@ -1,91 +0,0 @@
|
|||||||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
|
||||||
|
|
||||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
|
||||||
|
|
||||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
|
||||||
|
|
||||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
|
||||||
|
|
||||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
|
||||||
|
|
||||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
|
||||||
|
|
||||||
define amdgpu_kernel void @bench(ptr addrspace(4) byref(i64) %"55", ptr addrspace(4) byref(i64) %"56") #1 {
|
|
||||||
%"57" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"58" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"59" = alloca float, align 4, addrspace(5)
|
|
||||||
%"60" = alloca float, align 4, addrspace(5)
|
|
||||||
%"61" = alloca float, align 4, addrspace(5)
|
|
||||||
%"62" = alloca float, align 4, addrspace(5)
|
|
||||||
%"63" = alloca i32, align 4, addrspace(5)
|
|
||||||
%"64" = alloca i1, align 1, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"97"
|
|
||||||
|
|
||||||
"97": ; preds = %1
|
|
||||||
%"65" = load i64, ptr addrspace(4) %"55", align 4
|
|
||||||
store i64 %"65", ptr addrspace(5) %"57", align 4
|
|
||||||
%"66" = load i64, ptr addrspace(4) %"56", align 4
|
|
||||||
store i64 %"66", ptr addrspace(5) %"58", align 4
|
|
||||||
%"68" = load i64, ptr addrspace(5) %"57", align 4
|
|
||||||
%"91" = inttoptr i64 %"68" to ptr
|
|
||||||
%"67" = load float, ptr %"91", align 4
|
|
||||||
store float %"67", ptr addrspace(5) %"59", align 4
|
|
||||||
%"69" = load i64, ptr addrspace(5) %"57", align 4
|
|
||||||
%"92" = inttoptr i64 %"69" to ptr
|
|
||||||
%"39" = getelementptr inbounds i8, ptr %"92", i64 4
|
|
||||||
%"70" = load float, ptr %"39", align 4
|
|
||||||
store float %"70", ptr addrspace(5) %"60", align 4
|
|
||||||
%"71" = load i64, ptr addrspace(5) %"57", align 4
|
|
||||||
%"93" = inttoptr i64 %"71" to ptr
|
|
||||||
%"41" = getelementptr inbounds i8, ptr %"93", i64 8
|
|
||||||
%"72" = load float, ptr %"41", align 4
|
|
||||||
store float %"72", ptr addrspace(5) %"61", align 4
|
|
||||||
%"73" = load i64, ptr addrspace(5) %"57", align 4
|
|
||||||
%"94" = inttoptr i64 %"73" to ptr
|
|
||||||
%"43" = getelementptr inbounds i8, ptr %"94", i64 12
|
|
||||||
%"74" = load float, ptr %"43", align 4
|
|
||||||
store float %"74", ptr addrspace(5) %"62", align 4
|
|
||||||
store i32 0, ptr addrspace(5) %"63", align 4
|
|
||||||
br label %"10"
|
|
||||||
|
|
||||||
"10": ; preds = %"21", %"97"
|
|
||||||
%"77" = load float, ptr addrspace(5) %"59", align 4
|
|
||||||
%"78" = load float, ptr addrspace(5) %"60", align 4
|
|
||||||
call void asm sideeffect "s_denorm_mode 0", "~{mode}"()
|
|
||||||
%"76" = fmul float %"77", %"78"
|
|
||||||
store float %"76", ptr addrspace(5) %"59", align 4
|
|
||||||
%"80" = load float, ptr addrspace(5) %"61", align 4
|
|
||||||
%"81" = load float, ptr addrspace(5) %"62", align 4
|
|
||||||
call void asm sideeffect "s_denorm_mode 11", "~{mode}"()
|
|
||||||
%"79" = fmul float %"80", %"81"
|
|
||||||
store float %"79", ptr addrspace(5) %"61", align 4
|
|
||||||
%"83" = load i32, ptr addrspace(5) %"63", align 4
|
|
||||||
%"82" = add i32 %"83", 1
|
|
||||||
store i32 %"82", ptr addrspace(5) %"63", align 4
|
|
||||||
%"85" = load i32, ptr addrspace(5) %"63", align 4
|
|
||||||
%"84" = icmp eq i32 %"85", 100000000
|
|
||||||
store i1 %"84", ptr addrspace(5) %"64", align 1
|
|
||||||
%"86" = load i1, ptr addrspace(5) %"64", align 1
|
|
||||||
br i1 %"86", label %"11", label %"21"
|
|
||||||
|
|
||||||
"21": ; preds = %"10"
|
|
||||||
br label %"10"
|
|
||||||
|
|
||||||
"11": ; preds = %"10"
|
|
||||||
%"87" = load i64, ptr addrspace(5) %"58", align 4
|
|
||||||
%"88" = load float, ptr addrspace(5) %"59", align 4
|
|
||||||
%"95" = inttoptr i64 %"87" to ptr
|
|
||||||
store float %"88", ptr %"95", align 4
|
|
||||||
%"89" = load i64, ptr addrspace(5) %"58", align 4
|
|
||||||
%"96" = inttoptr i64 %"89" to ptr
|
|
||||||
%"48" = getelementptr inbounds i8, ptr %"96", i64 4
|
|
||||||
%"90" = load float, ptr addrspace(5) %"61", align 4
|
|
||||||
store float %"90", ptr %"48", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
||||||
attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
@ -1,46 +0,0 @@
|
|||||||
declare i32 @__zluda_ptx_impl_bfe_u32(i32, i32, i32) #0
|
|
||||||
|
|
||||||
define amdgpu_kernel void @bfe(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #1 {
|
|
||||||
%"38" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"39" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"40" = alloca i32, align 4, addrspace(5)
|
|
||||||
%"41" = alloca i32, align 4, addrspace(5)
|
|
||||||
%"42" = alloca i32, align 4, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"35"
|
|
||||||
|
|
||||||
"35": ; preds = %1
|
|
||||||
%"43" = load i64, ptr addrspace(4) %"36", align 4
|
|
||||||
store i64 %"43", ptr addrspace(5) %"38", align 4
|
|
||||||
%"44" = load i64, ptr addrspace(4) %"37", align 4
|
|
||||||
store i64 %"44", ptr addrspace(5) %"39", align 4
|
|
||||||
%"46" = load i64, ptr addrspace(5) %"38", align 4
|
|
||||||
%"57" = inttoptr i64 %"46" to ptr
|
|
||||||
%"45" = load i32, ptr %"57", align 4
|
|
||||||
store i32 %"45", ptr addrspace(5) %"40", align 4
|
|
||||||
%"47" = load i64, ptr addrspace(5) %"38", align 4
|
|
||||||
%"58" = inttoptr i64 %"47" to ptr
|
|
||||||
%"32" = getelementptr inbounds i8, ptr %"58", i64 4
|
|
||||||
%"48" = load i32, ptr %"32", align 4
|
|
||||||
store i32 %"48", ptr addrspace(5) %"41", align 4
|
|
||||||
%"49" = load i64, ptr addrspace(5) %"38", align 4
|
|
||||||
%"59" = inttoptr i64 %"49" to ptr
|
|
||||||
%"34" = getelementptr inbounds i8, ptr %"59", i64 8
|
|
||||||
%"50" = load i32, ptr %"34", align 4
|
|
||||||
store i32 %"50", ptr addrspace(5) %"42", align 4
|
|
||||||
%"52" = load i32, ptr addrspace(5) %"40", align 4
|
|
||||||
%"53" = load i32, ptr addrspace(5) %"41", align 4
|
|
||||||
%"54" = load i32, ptr addrspace(5) %"42", align 4
|
|
||||||
%"51" = call i32 @__zluda_ptx_impl_bfe_u32(i32 %"52", i32 %"53", i32 %"54")
|
|
||||||
store i32 %"51", ptr addrspace(5) %"40", align 4
|
|
||||||
%"55" = load i64, ptr addrspace(5) %"39", align 4
|
|
||||||
%"56" = load i32, ptr addrspace(5) %"40", align 4
|
|
||||||
%"60" = inttoptr i64 %"55" to ptr
|
|
||||||
store i32 %"56", ptr %"60", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
||||||
attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
@ -1,53 +0,0 @@
|
|||||||
declare i32 @__zluda_ptx_impl_bfi_b32(i32, i32, i32, i32) #0
|
|
||||||
|
|
||||||
define amdgpu_kernel void @bfi(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #1 {
|
|
||||||
%"41" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"42" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"43" = alloca i32, align 4, addrspace(5)
|
|
||||||
%"44" = alloca i32, align 4, addrspace(5)
|
|
||||||
%"45" = alloca i32, align 4, addrspace(5)
|
|
||||||
%"46" = alloca i32, align 4, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"38"
|
|
||||||
|
|
||||||
"38": ; preds = %1
|
|
||||||
%"47" = load i64, ptr addrspace(4) %"39", align 4
|
|
||||||
store i64 %"47", ptr addrspace(5) %"41", align 4
|
|
||||||
%"48" = load i64, ptr addrspace(4) %"40", align 4
|
|
||||||
store i64 %"48", ptr addrspace(5) %"42", align 4
|
|
||||||
%"50" = load i64, ptr addrspace(5) %"41", align 4
|
|
||||||
%"64" = inttoptr i64 %"50" to ptr
|
|
||||||
%"49" = load i32, ptr %"64", align 4
|
|
||||||
store i32 %"49", ptr addrspace(5) %"43", align 4
|
|
||||||
%"51" = load i64, ptr addrspace(5) %"41", align 4
|
|
||||||
%"65" = inttoptr i64 %"51" to ptr
|
|
||||||
%"33" = getelementptr inbounds i8, ptr %"65", i64 4
|
|
||||||
%"52" = load i32, ptr %"33", align 4
|
|
||||||
store i32 %"52", ptr addrspace(5) %"44", align 4
|
|
||||||
%"53" = load i64, ptr addrspace(5) %"41", align 4
|
|
||||||
%"66" = inttoptr i64 %"53" to ptr
|
|
||||||
%"35" = getelementptr inbounds i8, ptr %"66", i64 8
|
|
||||||
%"54" = load i32, ptr %"35", align 4
|
|
||||||
store i32 %"54", ptr addrspace(5) %"45", align 4
|
|
||||||
%"55" = load i64, ptr addrspace(5) %"41", align 4
|
|
||||||
%"67" = inttoptr i64 %"55" to ptr
|
|
||||||
%"37" = getelementptr inbounds i8, ptr %"67", i64 12
|
|
||||||
%"56" = load i32, ptr %"37", align 4
|
|
||||||
store i32 %"56", ptr addrspace(5) %"46", align 4
|
|
||||||
%"58" = load i32, ptr addrspace(5) %"43", align 4
|
|
||||||
%"59" = load i32, ptr addrspace(5) %"44", align 4
|
|
||||||
%"60" = load i32, ptr addrspace(5) %"45", align 4
|
|
||||||
%"61" = load i32, ptr addrspace(5) %"46", align 4
|
|
||||||
%"68" = call i32 @__zluda_ptx_impl_bfi_b32(i32 %"58", i32 %"59", i32 %"60", i32 %"61")
|
|
||||||
store i32 %"68", ptr addrspace(5) %"43", align 4
|
|
||||||
%"62" = load i64, ptr addrspace(5) %"42", align 4
|
|
||||||
%"63" = load i32, ptr addrspace(5) %"43", align 4
|
|
||||||
%"71" = inttoptr i64 %"62" to ptr
|
|
||||||
store i32 %"63", ptr %"71", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
||||||
attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
@ -1,34 +0,0 @@
|
|||||||
define amdgpu_kernel void @block(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 {
|
|
||||||
%"36" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"37" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"38" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"39" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"46" = alloca i64, align 8, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"33"
|
|
||||||
|
|
||||||
"33": ; preds = %1
|
|
||||||
%"40" = load i64, ptr addrspace(4) %"34", align 4
|
|
||||||
store i64 %"40", ptr addrspace(5) %"36", align 4
|
|
||||||
%"41" = load i64, ptr addrspace(4) %"35", align 4
|
|
||||||
store i64 %"41", ptr addrspace(5) %"37", align 4
|
|
||||||
%"43" = load i64, ptr addrspace(5) %"36", align 4
|
|
||||||
%"51" = inttoptr i64 %"43" to ptr
|
|
||||||
%"42" = load i64, ptr %"51", align 4
|
|
||||||
store i64 %"42", ptr addrspace(5) %"38", align 4
|
|
||||||
%"45" = load i64, ptr addrspace(5) %"38", align 4
|
|
||||||
%"44" = add i64 %"45", 1
|
|
||||||
store i64 %"44", ptr addrspace(5) %"39", align 4
|
|
||||||
%"48" = load i64, ptr addrspace(5) %"46", align 4
|
|
||||||
%"47" = add i64 %"48", 1
|
|
||||||
store i64 %"47", ptr addrspace(5) %"46", align 4
|
|
||||||
%"49" = load i64, ptr addrspace(5) %"37", align 4
|
|
||||||
%"50" = load i64, ptr addrspace(5) %"39", align 4
|
|
||||||
%"52" = inttoptr i64 %"49" to ptr
|
|
||||||
store i64 %"50", ptr %"52", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
@ -1,36 +0,0 @@
|
|||||||
define amdgpu_kernel void @bra(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 {
|
|
||||||
%"38" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"39" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"40" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"41" = alloca i64, align 8, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"35"
|
|
||||||
|
|
||||||
"35": ; preds = %1
|
|
||||||
%"42" = load i64, ptr addrspace(4) %"36", align 4
|
|
||||||
store i64 %"42", ptr addrspace(5) %"38", align 4
|
|
||||||
%"43" = load i64, ptr addrspace(4) %"37", align 4
|
|
||||||
store i64 %"43", ptr addrspace(5) %"39", align 4
|
|
||||||
%"45" = load i64, ptr addrspace(5) %"38", align 4
|
|
||||||
%"50" = inttoptr i64 %"45" to ptr
|
|
||||||
%"44" = load i64, ptr %"50", align 4
|
|
||||||
store i64 %"44", ptr addrspace(5) %"40", align 4
|
|
||||||
br label %"10"
|
|
||||||
|
|
||||||
"10": ; preds = %"35"
|
|
||||||
%"47" = load i64, ptr addrspace(5) %"40", align 4
|
|
||||||
%"46" = add i64 %"47", 1
|
|
||||||
store i64 %"46", ptr addrspace(5) %"41", align 4
|
|
||||||
br label %"12"
|
|
||||||
|
|
||||||
"12": ; preds = %"10"
|
|
||||||
%"48" = load i64, ptr addrspace(5) %"39", align 4
|
|
||||||
%"49" = load i64, ptr addrspace(5) %"41", align 4
|
|
||||||
%"51" = inttoptr i64 %"48" to ptr
|
|
||||||
store i64 %"49", ptr %"51", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
@ -1,33 +0,0 @@
|
|||||||
define amdgpu_kernel void @brev(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
|
|
||||||
%"32" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"33" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"34" = alloca i32, align 4, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"29"
|
|
||||||
|
|
||||||
"29": ; preds = %1
|
|
||||||
%"35" = load i64, ptr addrspace(4) %"30", align 4
|
|
||||||
store i64 %"35", ptr addrspace(5) %"32", align 4
|
|
||||||
%"36" = load i64, ptr addrspace(4) %"31", align 4
|
|
||||||
store i64 %"36", ptr addrspace(5) %"33", align 4
|
|
||||||
%"38" = load i64, ptr addrspace(5) %"32", align 4
|
|
||||||
%"43" = inttoptr i64 %"38" to ptr
|
|
||||||
%"37" = load i32, ptr %"43", align 4
|
|
||||||
store i32 %"37", ptr addrspace(5) %"34", align 4
|
|
||||||
%"40" = load i32, ptr addrspace(5) %"34", align 4
|
|
||||||
%"39" = call i32 @llvm.bitreverse.i32(i32 %"40")
|
|
||||||
store i32 %"39", ptr addrspace(5) %"34", align 4
|
|
||||||
%"41" = load i64, ptr addrspace(5) %"33", align 4
|
|
||||||
%"42" = load i32, ptr addrspace(5) %"34", align 4
|
|
||||||
%"44" = inttoptr i64 %"41" to ptr
|
|
||||||
store i32 %"42", ptr %"44", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
|
||||||
declare i32 @llvm.bitreverse.i32(i32) #1
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
||||||
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
|
@ -1,64 +0,0 @@
|
|||||||
define i64 @incr(i64 %"43") #0 {
|
|
||||||
%"63" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"64" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"65" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"66" = alloca i64, align 8, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"46"
|
|
||||||
|
|
||||||
"46": ; preds = %1
|
|
||||||
store i64 %"43", ptr addrspace(5) %"65", align 4
|
|
||||||
%"67" = load i64, ptr addrspace(5) %"65", align 4
|
|
||||||
store i64 %"67", ptr addrspace(5) %"66", align 4
|
|
||||||
%"69" = load i64, ptr addrspace(5) %"66", align 4
|
|
||||||
%"68" = add i64 %"69", 1
|
|
||||||
store i64 %"68", ptr addrspace(5) %"66", align 4
|
|
||||||
%"70" = load i64, ptr addrspace(5) %"66", align 4
|
|
||||||
store i64 %"70", ptr addrspace(5) %"64", align 4
|
|
||||||
%"71" = load i64, ptr addrspace(5) %"64", align 4
|
|
||||||
store i64 %"71", ptr addrspace(5) %"63", align 4
|
|
||||||
%2 = load i64, ptr addrspace(5) %"63", align 4
|
|
||||||
ret i64 %2
|
|
||||||
}
|
|
||||||
|
|
||||||
define amdgpu_kernel void @call(ptr addrspace(4) byref(i64) %"48", ptr addrspace(4) byref(i64) %"49") #1 {
|
|
||||||
%"50" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"51" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"52" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"57" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"58" = alloca i64, align 8, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"44"
|
|
||||||
|
|
||||||
"44": ; preds = %1
|
|
||||||
%"53" = load i64, ptr addrspace(4) %"48", align 4
|
|
||||||
store i64 %"53", ptr addrspace(5) %"50", align 4
|
|
||||||
%"54" = load i64, ptr addrspace(4) %"49", align 4
|
|
||||||
store i64 %"54", ptr addrspace(5) %"51", align 4
|
|
||||||
%"56" = load i64, ptr addrspace(5) %"50", align 4
|
|
||||||
%"72" = inttoptr i64 %"56" to ptr addrspace(1)
|
|
||||||
%"55" = load i64, ptr addrspace(1) %"72", align 4
|
|
||||||
store i64 %"55", ptr addrspace(5) %"52", align 4
|
|
||||||
%"59" = load i64, ptr addrspace(5) %"52", align 4
|
|
||||||
store i64 %"59", ptr addrspace(5) %"57", align 4
|
|
||||||
%"40" = load i64, ptr addrspace(5) %"57", align 4
|
|
||||||
%"41" = call i64 @incr(i64 %"40")
|
|
||||||
br label %"45"
|
|
||||||
|
|
||||||
"45": ; preds = %"44"
|
|
||||||
store i64 %"41", ptr addrspace(5) %"58", align 4
|
|
||||||
%"60" = load i64, ptr addrspace(5) %"58", align 4
|
|
||||||
store i64 %"60", ptr addrspace(5) %"52", align 4
|
|
||||||
%"61" = load i64, ptr addrspace(5) %"51", align 4
|
|
||||||
%"62" = load i64, ptr addrspace(5) %"52", align 4
|
|
||||||
%"75" = inttoptr i64 %"61" to ptr addrspace(1)
|
|
||||||
store i64 %"62", ptr addrspace(1) %"75", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
||||||
attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
@ -1,155 +0,0 @@
|
|||||||
define float @add_rm(float %"79", float %"80") #0 {
|
|
||||||
%"128" = alloca float, align 4, addrspace(5)
|
|
||||||
%"129" = alloca float, align 4, addrspace(5)
|
|
||||||
%"130" = alloca float, align 4, addrspace(5)
|
|
||||||
%"131" = alloca float, align 4, addrspace(5)
|
|
||||||
%"132" = alloca float, align 4, addrspace(5)
|
|
||||||
%"133" = alloca float, align 4, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"89"
|
|
||||||
|
|
||||||
"89": ; preds = %1
|
|
||||||
call void @llvm.amdgcn.s.setreg(i32 6145, i32 2)
|
|
||||||
br label %"87"
|
|
||||||
|
|
||||||
"87": ; preds = %"89"
|
|
||||||
store float %"79", ptr addrspace(5) %"130", align 4
|
|
||||||
store float %"80", ptr addrspace(5) %"131", align 4
|
|
||||||
%"134" = load float, ptr addrspace(5) %"130", align 4
|
|
||||||
store float %"134", ptr addrspace(5) %"132", align 4
|
|
||||||
%"135" = load float, ptr addrspace(5) %"131", align 4
|
|
||||||
store float %"135", ptr addrspace(5) %"133", align 4
|
|
||||||
%"137" = load float, ptr addrspace(5) %"132", align 4
|
|
||||||
%"138" = load float, ptr addrspace(5) %"133", align 4
|
|
||||||
%"136" = fadd float %"137", %"138"
|
|
||||||
store float %"136", ptr addrspace(5) %"132", align 4
|
|
||||||
%"139" = load float, ptr addrspace(5) %"132", align 4
|
|
||||||
store float %"139", ptr addrspace(5) %"129", align 4
|
|
||||||
%"140" = load float, ptr addrspace(5) %"129", align 4
|
|
||||||
store float %"140", ptr addrspace(5) %"128", align 4
|
|
||||||
%2 = load float, ptr addrspace(5) %"128", align 4
|
|
||||||
ret float %2
|
|
||||||
}
|
|
||||||
|
|
||||||
define float @add_rp(float %"82", float %"83") #0 {
|
|
||||||
%"141" = alloca float, align 4, addrspace(5)
|
|
||||||
%"142" = alloca float, align 4, addrspace(5)
|
|
||||||
%"143" = alloca float, align 4, addrspace(5)
|
|
||||||
%"144" = alloca float, align 4, addrspace(5)
|
|
||||||
%"145" = alloca float, align 4, addrspace(5)
|
|
||||||
%"146" = alloca float, align 4, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"88"
|
|
||||||
|
|
||||||
"88": ; preds = %1
|
|
||||||
store float %"82", ptr addrspace(5) %"143", align 4
|
|
||||||
store float %"83", ptr addrspace(5) %"144", align 4
|
|
||||||
%"147" = load float, ptr addrspace(5) %"143", align 4
|
|
||||||
store float %"147", ptr addrspace(5) %"145", align 4
|
|
||||||
%"148" = load float, ptr addrspace(5) %"144", align 4
|
|
||||||
store float %"148", ptr addrspace(5) %"146", align 4
|
|
||||||
%"150" = load float, ptr addrspace(5) %"145", align 4
|
|
||||||
%"151" = load float, ptr addrspace(5) %"146", align 4
|
|
||||||
%"149" = fadd float %"150", %"151"
|
|
||||||
store float %"149", ptr addrspace(5) %"145", align 4
|
|
||||||
%"152" = load float, ptr addrspace(5) %"145", align 4
|
|
||||||
store float %"152", ptr addrspace(5) %"142", align 4
|
|
||||||
%"153" = load float, ptr addrspace(5) %"142", align 4
|
|
||||||
store float %"153", ptr addrspace(5) %"141", align 4
|
|
||||||
%2 = load float, ptr addrspace(5) %"141", align 4
|
|
||||||
ret float %2
|
|
||||||
}
|
|
||||||
|
|
||||||
define amdgpu_kernel void @call_rnd(ptr addrspace(4) byref(i64) %"92", ptr addrspace(4) byref(i64) %"93") #1 {
|
|
||||||
%"94" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"95" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"96" = alloca float, align 4, addrspace(5)
|
|
||||||
%"97" = alloca float, align 4, addrspace(5)
|
|
||||||
%"98" = alloca float, align 4, addrspace(5)
|
|
||||||
%"99" = alloca float, align 4, addrspace(5)
|
|
||||||
%"100" = alloca float, align 4, addrspace(5)
|
|
||||||
%"101" = alloca float, align 4, addrspace(5)
|
|
||||||
%"102" = alloca float, align 4, addrspace(5)
|
|
||||||
%"103" = alloca float, align 4, addrspace(5)
|
|
||||||
%"104" = alloca float, align 4, addrspace(5)
|
|
||||||
%"105" = alloca float, align 4, addrspace(5)
|
|
||||||
%"106" = alloca float, align 4, addrspace(5)
|
|
||||||
%"107" = alloca float, align 4, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"84"
|
|
||||||
|
|
||||||
"84": ; preds = %1
|
|
||||||
call void @llvm.amdgcn.s.setreg(i32 6145, i32 1)
|
|
||||||
%"108" = load i64, ptr addrspace(4) %"92", align 4
|
|
||||||
store i64 %"108", ptr addrspace(5) %"94", align 4
|
|
||||||
%"109" = load i64, ptr addrspace(4) %"93", align 4
|
|
||||||
store i64 %"109", ptr addrspace(5) %"95", align 4
|
|
||||||
%"111" = load i64, ptr addrspace(5) %"94", align 4
|
|
||||||
%"154" = inttoptr i64 %"111" to ptr
|
|
||||||
%"110" = load float, ptr %"154", align 4
|
|
||||||
store float %"110", ptr addrspace(5) %"96", align 4
|
|
||||||
%"112" = load i64, ptr addrspace(5) %"94", align 4
|
|
||||||
%"155" = inttoptr i64 %"112" to ptr
|
|
||||||
%"59" = getelementptr inbounds i8, ptr %"155", i64 4
|
|
||||||
%"113" = load float, ptr %"59", align 4
|
|
||||||
store float %"113", ptr addrspace(5) %"97", align 4
|
|
||||||
%"114" = load i64, ptr addrspace(5) %"94", align 4
|
|
||||||
%"156" = inttoptr i64 %"114" to ptr
|
|
||||||
%"61" = getelementptr inbounds i8, ptr %"156", i64 8
|
|
||||||
%"115" = load float, ptr %"61", align 4
|
|
||||||
store float %"115", ptr addrspace(5) %"98", align 4
|
|
||||||
%"116" = load i64, ptr addrspace(5) %"94", align 4
|
|
||||||
%"157" = inttoptr i64 %"116" to ptr
|
|
||||||
%"63" = getelementptr inbounds i8, ptr %"157", i64 12
|
|
||||||
%"117" = load float, ptr %"63", align 4
|
|
||||||
store float %"117", ptr addrspace(5) %"99", align 4
|
|
||||||
%"118" = load float, ptr addrspace(5) %"96", align 4
|
|
||||||
store float %"118", ptr addrspace(5) %"102", align 4
|
|
||||||
%"119" = load float, ptr addrspace(5) %"97", align 4
|
|
||||||
store float %"119", ptr addrspace(5) %"103", align 4
|
|
||||||
%"72" = load float, ptr addrspace(5) %"102", align 4
|
|
||||||
%"73" = load float, ptr addrspace(5) %"103", align 4
|
|
||||||
%"74" = call float @add_rp(float %"72", float %"73")
|
|
||||||
br label %"85"
|
|
||||||
|
|
||||||
"85": ; preds = %"84"
|
|
||||||
store float %"74", ptr addrspace(5) %"104", align 4
|
|
||||||
%"120" = load float, ptr addrspace(5) %"104", align 4
|
|
||||||
store float %"120", ptr addrspace(5) %"100", align 4
|
|
||||||
%"121" = load i64, ptr addrspace(5) %"95", align 4
|
|
||||||
%"122" = load float, ptr addrspace(5) %"100", align 4
|
|
||||||
%"158" = inttoptr i64 %"121" to ptr
|
|
||||||
store float %"122", ptr %"158", align 4
|
|
||||||
%"123" = load float, ptr addrspace(5) %"98", align 4
|
|
||||||
store float %"123", ptr addrspace(5) %"105", align 4
|
|
||||||
%"124" = load float, ptr addrspace(5) %"99", align 4
|
|
||||||
store float %"124", ptr addrspace(5) %"106", align 4
|
|
||||||
%"75" = load float, ptr addrspace(5) %"105", align 4
|
|
||||||
%"76" = load float, ptr addrspace(5) %"106", align 4
|
|
||||||
%"77" = call float @add_rm(float %"75", float %"76")
|
|
||||||
br label %"86"
|
|
||||||
|
|
||||||
"86": ; preds = %"85"
|
|
||||||
store float %"77", ptr addrspace(5) %"107", align 4
|
|
||||||
%"125" = load float, ptr addrspace(5) %"107", align 4
|
|
||||||
store float %"125", ptr addrspace(5) %"101", align 4
|
|
||||||
%"126" = load i64, ptr addrspace(5) %"95", align 4
|
|
||||||
%"159" = inttoptr i64 %"126" to ptr
|
|
||||||
%"65" = getelementptr inbounds i8, ptr %"159", i64 4
|
|
||||||
%"127" = load float, ptr addrspace(5) %"101", align 4
|
|
||||||
store float %"127", ptr %"65", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
; Function Attrs: nocallback nofree nosync nounwind willreturn
|
|
||||||
declare void @llvm.amdgcn.s.setreg(i32 immarg, i32) #2
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
||||||
attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
||||||
attributes #2 = { nocallback nofree nosync nounwind willreturn }
|
|
@ -1,33 +0,0 @@
|
|||||||
define amdgpu_kernel void @clz(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
|
|
||||||
%"32" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"33" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"34" = alloca i32, align 4, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"29"
|
|
||||||
|
|
||||||
"29": ; preds = %1
|
|
||||||
%"35" = load i64, ptr addrspace(4) %"30", align 4
|
|
||||||
store i64 %"35", ptr addrspace(5) %"32", align 4
|
|
||||||
%"36" = load i64, ptr addrspace(4) %"31", align 4
|
|
||||||
store i64 %"36", ptr addrspace(5) %"33", align 4
|
|
||||||
%"38" = load i64, ptr addrspace(5) %"32", align 4
|
|
||||||
%"43" = inttoptr i64 %"38" to ptr
|
|
||||||
%"37" = load i32, ptr %"43", align 4
|
|
||||||
store i32 %"37", ptr addrspace(5) %"34", align 4
|
|
||||||
%"40" = load i32, ptr addrspace(5) %"34", align 4
|
|
||||||
%"44" = call i32 @llvm.ctlz.i32(i32 %"40", i1 false)
|
|
||||||
store i32 %"44", ptr addrspace(5) %"34", align 4
|
|
||||||
%"41" = load i64, ptr addrspace(5) %"33", align 4
|
|
||||||
%"42" = load i32, ptr addrspace(5) %"34", align 4
|
|
||||||
%"45" = inttoptr i64 %"41" to ptr
|
|
||||||
store i32 %"42", ptr %"45", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
|
||||||
declare i32 @llvm.ctlz.i32(i32, i1 immarg) #1
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
||||||
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
|
@ -1,50 +0,0 @@
|
|||||||
@constparams = addrspace(4) global [4 x i16] [i16 10, i16 20, i16 30, i16 40], align 8
|
|
||||||
|
|
||||||
define amdgpu_kernel void @const(ptr addrspace(4) byref(i64) %"46", ptr addrspace(4) byref(i64) %"47") #0 {
|
|
||||||
%"48" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"49" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"50" = alloca i16, align 2, addrspace(5)
|
|
||||||
%"51" = alloca i16, align 2, addrspace(5)
|
|
||||||
%"52" = alloca i16, align 2, addrspace(5)
|
|
||||||
%"53" = alloca i16, align 2, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"45"
|
|
||||||
|
|
||||||
"45": ; preds = %1
|
|
||||||
%"54" = load i64, ptr addrspace(4) %"46", align 4
|
|
||||||
store i64 %"54", ptr addrspace(5) %"48", align 4
|
|
||||||
%"55" = load i64, ptr addrspace(4) %"47", align 4
|
|
||||||
store i64 %"55", ptr addrspace(5) %"49", align 4
|
|
||||||
%"56" = load i16, ptr addrspace(4) @constparams, align 2
|
|
||||||
store i16 %"56", ptr addrspace(5) %"50", align 2
|
|
||||||
%"57" = load i16, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) @constparams, i64 2), align 2
|
|
||||||
store i16 %"57", ptr addrspace(5) %"51", align 2
|
|
||||||
%"58" = load i16, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) @constparams, i64 4), align 2
|
|
||||||
store i16 %"58", ptr addrspace(5) %"52", align 2
|
|
||||||
%"59" = load i16, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) @constparams, i64 6), align 2
|
|
||||||
store i16 %"59", ptr addrspace(5) %"53", align 2
|
|
||||||
%"60" = load i64, ptr addrspace(5) %"49", align 4
|
|
||||||
%"61" = load i16, ptr addrspace(5) %"50", align 2
|
|
||||||
%"72" = inttoptr i64 %"60" to ptr
|
|
||||||
store i16 %"61", ptr %"72", align 2
|
|
||||||
%"62" = load i64, ptr addrspace(5) %"49", align 4
|
|
||||||
%"74" = inttoptr i64 %"62" to ptr
|
|
||||||
%"40" = getelementptr inbounds i8, ptr %"74", i64 2
|
|
||||||
%"63" = load i16, ptr addrspace(5) %"51", align 2
|
|
||||||
store i16 %"63", ptr %"40", align 2
|
|
||||||
%"64" = load i64, ptr addrspace(5) %"49", align 4
|
|
||||||
%"76" = inttoptr i64 %"64" to ptr
|
|
||||||
%"42" = getelementptr inbounds i8, ptr %"76", i64 4
|
|
||||||
%"65" = load i16, ptr addrspace(5) %"52", align 2
|
|
||||||
store i16 %"65", ptr %"42", align 2
|
|
||||||
%"66" = load i64, ptr addrspace(5) %"49", align 4
|
|
||||||
%"78" = inttoptr i64 %"66" to ptr
|
|
||||||
%"44" = getelementptr inbounds i8, ptr %"78", i64 6
|
|
||||||
%"67" = load i16, ptr addrspace(5) %"53", align 2
|
|
||||||
store i16 %"67", ptr %"44", align 2
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
@ -1,29 +0,0 @@
|
|||||||
define amdgpu_kernel void @constant_f32(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
|
|
||||||
%"33" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"34" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"35" = alloca float, align 4, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"30"
|
|
||||||
|
|
||||||
"30": ; preds = %1
|
|
||||||
%"36" = load i64, ptr addrspace(4) %"31", align 4
|
|
||||||
store i64 %"36", ptr addrspace(5) %"33", align 4
|
|
||||||
%"37" = load i64, ptr addrspace(4) %"32", align 4
|
|
||||||
store i64 %"37", ptr addrspace(5) %"34", align 4
|
|
||||||
%"39" = load i64, ptr addrspace(5) %"33", align 4
|
|
||||||
%"44" = inttoptr i64 %"39" to ptr
|
|
||||||
%"38" = load float, ptr %"44", align 4
|
|
||||||
store float %"38", ptr addrspace(5) %"35", align 4
|
|
||||||
%"41" = load float, ptr addrspace(5) %"35", align 4
|
|
||||||
%"40" = fmul float %"41", 5.000000e-01
|
|
||||||
store float %"40", ptr addrspace(5) %"35", align 4
|
|
||||||
%"42" = load i64, ptr addrspace(5) %"34", align 4
|
|
||||||
%"43" = load float, ptr addrspace(5) %"35", align 4
|
|
||||||
%"45" = inttoptr i64 %"42" to ptr
|
|
||||||
store float %"43", ptr %"45", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
@ -1,29 +0,0 @@
|
|||||||
define amdgpu_kernel void @constant_negative(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
|
|
||||||
%"33" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"34" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"35" = alloca i32, align 4, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"30"
|
|
||||||
|
|
||||||
"30": ; preds = %1
|
|
||||||
%"36" = load i64, ptr addrspace(4) %"31", align 4
|
|
||||||
store i64 %"36", ptr addrspace(5) %"33", align 4
|
|
||||||
%"37" = load i64, ptr addrspace(4) %"32", align 4
|
|
||||||
store i64 %"37", ptr addrspace(5) %"34", align 4
|
|
||||||
%"39" = load i64, ptr addrspace(5) %"33", align 4
|
|
||||||
%"44" = inttoptr i64 %"39" to ptr
|
|
||||||
%"38" = load i32, ptr %"44", align 4
|
|
||||||
store i32 %"38", ptr addrspace(5) %"35", align 4
|
|
||||||
%"41" = load i32, ptr addrspace(5) %"35", align 4
|
|
||||||
%"40" = mul i32 %"41", -1
|
|
||||||
store i32 %"40", ptr addrspace(5) %"35", align 4
|
|
||||||
%"42" = load i64, ptr addrspace(5) %"34", align 4
|
|
||||||
%"43" = load i32, ptr addrspace(5) %"35", align 4
|
|
||||||
%"45" = inttoptr i64 %"42" to ptr
|
|
||||||
store i32 %"43", ptr %"45", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
@ -1,33 +0,0 @@
|
|||||||
define amdgpu_kernel void @cos(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
|
|
||||||
%"32" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"33" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"34" = alloca float, align 4, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"29"
|
|
||||||
|
|
||||||
"29": ; preds = %1
|
|
||||||
%"35" = load i64, ptr addrspace(4) %"30", align 4
|
|
||||||
store i64 %"35", ptr addrspace(5) %"32", align 4
|
|
||||||
%"36" = load i64, ptr addrspace(4) %"31", align 4
|
|
||||||
store i64 %"36", ptr addrspace(5) %"33", align 4
|
|
||||||
%"38" = load i64, ptr addrspace(5) %"32", align 4
|
|
||||||
%"43" = inttoptr i64 %"38" to ptr
|
|
||||||
%"37" = load float, ptr %"43", align 4
|
|
||||||
store float %"37", ptr addrspace(5) %"34", align 4
|
|
||||||
%"40" = load float, ptr addrspace(5) %"34", align 4
|
|
||||||
%"39" = call afn float @llvm.cos.f32(float %"40")
|
|
||||||
store float %"39", ptr addrspace(5) %"34", align 4
|
|
||||||
%"41" = load i64, ptr addrspace(5) %"33", align 4
|
|
||||||
%"42" = load float, ptr addrspace(5) %"34", align 4
|
|
||||||
%"44" = inttoptr i64 %"41" to ptr
|
|
||||||
store float %"42", ptr %"44", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
|
||||||
declare float @llvm.cos.f32(float) #1
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
||||||
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
|
@ -1,30 +0,0 @@
|
|||||||
define amdgpu_kernel void @cvt_f64_f32(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
|
|
||||||
%"33" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"34" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"35" = alloca float, align 4, addrspace(5)
|
|
||||||
%"36" = alloca double, align 8, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"30"
|
|
||||||
|
|
||||||
"30": ; preds = %1
|
|
||||||
%"37" = load i64, ptr addrspace(4) %"31", align 4
|
|
||||||
store i64 %"37", ptr addrspace(5) %"33", align 4
|
|
||||||
%"38" = load i64, ptr addrspace(4) %"32", align 4
|
|
||||||
store i64 %"38", ptr addrspace(5) %"34", align 4
|
|
||||||
%"40" = load i64, ptr addrspace(5) %"33", align 4
|
|
||||||
%"45" = inttoptr i64 %"40" to ptr addrspace(1)
|
|
||||||
%"39" = load float, ptr addrspace(1) %"45", align 4
|
|
||||||
store float %"39", ptr addrspace(5) %"35", align 4
|
|
||||||
%"42" = load float, ptr addrspace(5) %"35", align 4
|
|
||||||
%"41" = fpext float %"42" to double
|
|
||||||
store double %"41", ptr addrspace(5) %"36", align 8
|
|
||||||
%"43" = load i64, ptr addrspace(5) %"34", align 4
|
|
||||||
%"44" = load double, ptr addrspace(5) %"36", align 8
|
|
||||||
%"46" = inttoptr i64 %"43" to ptr
|
|
||||||
store double %"44", ptr %"46", align 8
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
@ -1,49 +0,0 @@
|
|||||||
define amdgpu_kernel void @cvt_rni(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 {
|
|
||||||
%"37" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"38" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"39" = alloca float, align 4, addrspace(5)
|
|
||||||
%"40" = alloca float, align 4, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"34"
|
|
||||||
|
|
||||||
"34": ; preds = %1
|
|
||||||
%"41" = load i64, ptr addrspace(4) %"35", align 4
|
|
||||||
store i64 %"41", ptr addrspace(5) %"37", align 4
|
|
||||||
%"42" = load i64, ptr addrspace(4) %"36", align 4
|
|
||||||
store i64 %"42", ptr addrspace(5) %"38", align 4
|
|
||||||
%"44" = load i64, ptr addrspace(5) %"37", align 4
|
|
||||||
%"55" = inttoptr i64 %"44" to ptr
|
|
||||||
%"43" = load float, ptr %"55", align 4
|
|
||||||
store float %"43", ptr addrspace(5) %"39", align 4
|
|
||||||
%"45" = load i64, ptr addrspace(5) %"37", align 4
|
|
||||||
%"56" = inttoptr i64 %"45" to ptr
|
|
||||||
%"31" = getelementptr inbounds i8, ptr %"56", i64 4
|
|
||||||
%"46" = load float, ptr %"31", align 4
|
|
||||||
store float %"46", ptr addrspace(5) %"40", align 4
|
|
||||||
%"48" = load float, ptr addrspace(5) %"39", align 4
|
|
||||||
%2 = call float @llvm.roundeven.f32(float %"48")
|
|
||||||
%"47" = freeze float %2
|
|
||||||
store float %"47", ptr addrspace(5) %"39", align 4
|
|
||||||
%"50" = load float, ptr addrspace(5) %"40", align 4
|
|
||||||
%3 = call float @llvm.roundeven.f32(float %"50")
|
|
||||||
%"49" = freeze float %3
|
|
||||||
store float %"49", ptr addrspace(5) %"40", align 4
|
|
||||||
%"51" = load i64, ptr addrspace(5) %"38", align 4
|
|
||||||
%"52" = load float, ptr addrspace(5) %"39", align 4
|
|
||||||
%"57" = inttoptr i64 %"51" to ptr
|
|
||||||
store float %"52", ptr %"57", align 4
|
|
||||||
%"53" = load i64, ptr addrspace(5) %"38", align 4
|
|
||||||
%"58" = inttoptr i64 %"53" to ptr
|
|
||||||
%"33" = getelementptr inbounds i8, ptr %"58", i64 4
|
|
||||||
%"54" = load float, ptr addrspace(5) %"40", align 4
|
|
||||||
store float %"54", ptr %"33", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
|
||||||
declare float @llvm.roundeven.f32(float) #1
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
||||||
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
|
@ -1,54 +0,0 @@
|
|||||||
define amdgpu_kernel void @cvt_rzi(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 {
|
|
||||||
%"37" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"38" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"39" = alloca float, align 4, addrspace(5)
|
|
||||||
%"40" = alloca float, align 4, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"34"
|
|
||||||
|
|
||||||
"34": ; preds = %1
|
|
||||||
call void @llvm.amdgcn.s.setreg(i32 6145, i32 3)
|
|
||||||
%"41" = load i64, ptr addrspace(4) %"35", align 4
|
|
||||||
store i64 %"41", ptr addrspace(5) %"37", align 4
|
|
||||||
%"42" = load i64, ptr addrspace(4) %"36", align 4
|
|
||||||
store i64 %"42", ptr addrspace(5) %"38", align 4
|
|
||||||
%"44" = load i64, ptr addrspace(5) %"37", align 4
|
|
||||||
%"55" = inttoptr i64 %"44" to ptr
|
|
||||||
%"43" = load float, ptr %"55", align 4
|
|
||||||
store float %"43", ptr addrspace(5) %"39", align 4
|
|
||||||
%"45" = load i64, ptr addrspace(5) %"37", align 4
|
|
||||||
%"56" = inttoptr i64 %"45" to ptr
|
|
||||||
%"31" = getelementptr inbounds i8, ptr %"56", i64 4
|
|
||||||
%"46" = load float, ptr %"31", align 4
|
|
||||||
store float %"46", ptr addrspace(5) %"40", align 4
|
|
||||||
%"48" = load float, ptr addrspace(5) %"39", align 4
|
|
||||||
%2 = call float @llvm.trunc.f32(float %"48")
|
|
||||||
%"47" = freeze float %2
|
|
||||||
store float %"47", ptr addrspace(5) %"39", align 4
|
|
||||||
%"50" = load float, ptr addrspace(5) %"40", align 4
|
|
||||||
%3 = call float @llvm.trunc.f32(float %"50")
|
|
||||||
%"49" = freeze float %3
|
|
||||||
store float %"49", ptr addrspace(5) %"40", align 4
|
|
||||||
%"51" = load i64, ptr addrspace(5) %"38", align 4
|
|
||||||
%"52" = load float, ptr addrspace(5) %"39", align 4
|
|
||||||
%"57" = inttoptr i64 %"51" to ptr
|
|
||||||
store float %"52", ptr %"57", align 4
|
|
||||||
%"53" = load i64, ptr addrspace(5) %"38", align 4
|
|
||||||
%"58" = inttoptr i64 %"53" to ptr
|
|
||||||
%"33" = getelementptr inbounds i8, ptr %"58", i64 4
|
|
||||||
%"54" = load float, ptr addrspace(5) %"40", align 4
|
|
||||||
store float %"54", ptr %"33", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
; Function Attrs: nocallback nofree nosync nounwind willreturn
|
|
||||||
declare void @llvm.amdgcn.s.setreg(i32 immarg, i32) #1
|
|
||||||
|
|
||||||
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
|
||||||
declare float @llvm.trunc.f32(float) #2
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
||||||
attributes #1 = { nocallback nofree nosync nounwind willreturn }
|
|
||||||
attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
|
@ -1,32 +0,0 @@
|
|||||||
define amdgpu_kernel void @cvt_s16_s8(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
|
|
||||||
%"33" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"34" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"35" = alloca i32, align 4, addrspace(5)
|
|
||||||
%"36" = alloca i32, align 4, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"30"
|
|
||||||
|
|
||||||
"30": ; preds = %1
|
|
||||||
%"37" = load i64, ptr addrspace(4) %"31", align 4
|
|
||||||
store i64 %"37", ptr addrspace(5) %"33", align 4
|
|
||||||
%"38" = load i64, ptr addrspace(4) %"32", align 4
|
|
||||||
store i64 %"38", ptr addrspace(5) %"34", align 4
|
|
||||||
%"40" = load i64, ptr addrspace(5) %"33", align 4
|
|
||||||
%"45" = inttoptr i64 %"40" to ptr addrspace(1)
|
|
||||||
%"39" = load i32, ptr addrspace(1) %"45", align 4
|
|
||||||
store i32 %"39", ptr addrspace(5) %"36", align 4
|
|
||||||
%"42" = load i32, ptr addrspace(5) %"36", align 4
|
|
||||||
%2 = trunc i32 %"42" to i8
|
|
||||||
%"46" = sext i8 %2 to i16
|
|
||||||
%"41" = sext i16 %"46" to i32
|
|
||||||
store i32 %"41", ptr addrspace(5) %"35", align 4
|
|
||||||
%"43" = load i64, ptr addrspace(5) %"34", align 4
|
|
||||||
%"44" = load i32, ptr addrspace(5) %"35", align 4
|
|
||||||
%"48" = inttoptr i64 %"43" to ptr
|
|
||||||
store i32 %"44", ptr %"48", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
@ -1,55 +0,0 @@
|
|||||||
define amdgpu_kernel void @cvt_s32_f32(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 {
|
|
||||||
%"37" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"38" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"39" = alloca i32, align 4, addrspace(5)
|
|
||||||
%"40" = alloca i32, align 4, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"34"
|
|
||||||
|
|
||||||
"34": ; preds = %1
|
|
||||||
%"41" = load i64, ptr addrspace(4) %"35", align 4
|
|
||||||
store i64 %"41", ptr addrspace(5) %"37", align 4
|
|
||||||
%"42" = load i64, ptr addrspace(4) %"36", align 4
|
|
||||||
store i64 %"42", ptr addrspace(5) %"38", align 4
|
|
||||||
%"44" = load i64, ptr addrspace(5) %"37", align 4
|
|
||||||
%"56" = inttoptr i64 %"44" to ptr
|
|
||||||
%"55" = load float, ptr %"56", align 4
|
|
||||||
%"43" = bitcast float %"55" to i32
|
|
||||||
store i32 %"43", ptr addrspace(5) %"39", align 4
|
|
||||||
%"45" = load i64, ptr addrspace(5) %"37", align 4
|
|
||||||
%"57" = inttoptr i64 %"45" to ptr
|
|
||||||
%"31" = getelementptr inbounds i8, ptr %"57", i64 4
|
|
||||||
%"58" = load float, ptr %"31", align 4
|
|
||||||
%"46" = bitcast float %"58" to i32
|
|
||||||
store i32 %"46", ptr addrspace(5) %"40", align 4
|
|
||||||
%"48" = load i32, ptr addrspace(5) %"39", align 4
|
|
||||||
%"60" = bitcast i32 %"48" to float
|
|
||||||
%2 = call float @llvm.ceil.f32(float %"60")
|
|
||||||
%3 = fptosi float %2 to i32
|
|
||||||
%"59" = freeze i32 %3
|
|
||||||
store i32 %"59", ptr addrspace(5) %"39", align 4
|
|
||||||
%"50" = load i32, ptr addrspace(5) %"40", align 4
|
|
||||||
%"62" = bitcast i32 %"50" to float
|
|
||||||
%4 = call float @llvm.ceil.f32(float %"62")
|
|
||||||
%5 = fptosi float %4 to i32
|
|
||||||
%"61" = freeze i32 %5
|
|
||||||
store i32 %"61", ptr addrspace(5) %"40", align 4
|
|
||||||
%"51" = load i64, ptr addrspace(5) %"38", align 4
|
|
||||||
%"52" = load i32, ptr addrspace(5) %"39", align 4
|
|
||||||
%"63" = inttoptr i64 %"51" to ptr addrspace(1)
|
|
||||||
store i32 %"52", ptr addrspace(1) %"63", align 4
|
|
||||||
%"53" = load i64, ptr addrspace(5) %"38", align 4
|
|
||||||
%"65" = inttoptr i64 %"53" to ptr addrspace(1)
|
|
||||||
%"33" = getelementptr inbounds i8, ptr addrspace(1) %"65", i64 4
|
|
||||||
%"54" = load i32, ptr addrspace(5) %"40", align 4
|
|
||||||
store i32 %"54", ptr addrspace(1) %"33", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
|
||||||
declare float @llvm.ceil.f32(float) #1
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
||||||
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
|
@ -1,30 +0,0 @@
|
|||||||
define amdgpu_kernel void @cvt_s64_s32(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
|
|
||||||
%"33" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"34" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"35" = alloca i32, align 4, addrspace(5)
|
|
||||||
%"36" = alloca i64, align 8, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"30"
|
|
||||||
|
|
||||||
"30": ; preds = %1
|
|
||||||
%"37" = load i64, ptr addrspace(4) %"31", align 4
|
|
||||||
store i64 %"37", ptr addrspace(5) %"33", align 4
|
|
||||||
%"38" = load i64, ptr addrspace(4) %"32", align 4
|
|
||||||
store i64 %"38", ptr addrspace(5) %"34", align 4
|
|
||||||
%"40" = load i64, ptr addrspace(5) %"33", align 4
|
|
||||||
%"46" = inttoptr i64 %"40" to ptr
|
|
||||||
%"45" = load i32, ptr %"46", align 4
|
|
||||||
store i32 %"45", ptr addrspace(5) %"35", align 4
|
|
||||||
%"42" = load i32, ptr addrspace(5) %"35", align 4
|
|
||||||
%"41" = sext i32 %"42" to i64
|
|
||||||
store i64 %"41", ptr addrspace(5) %"36", align 4
|
|
||||||
%"43" = load i64, ptr addrspace(5) %"34", align 4
|
|
||||||
%"44" = load i64, ptr addrspace(5) %"36", align 4
|
|
||||||
%"47" = inttoptr i64 %"43" to ptr
|
|
||||||
store i64 %"44", ptr %"47", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
@ -1,41 +0,0 @@
|
|||||||
define amdgpu_kernel void @cvt_sat_s_u(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) byref(i64) %"33") #0 {
|
|
||||||
%"34" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"35" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"36" = alloca i32, align 4, addrspace(5)
|
|
||||||
%"37" = alloca i32, align 4, addrspace(5)
|
|
||||||
%"38" = alloca i32, align 4, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"31"
|
|
||||||
|
|
||||||
"31": ; preds = %1
|
|
||||||
%"39" = load i64, ptr addrspace(4) %"32", align 4
|
|
||||||
store i64 %"39", ptr addrspace(5) %"34", align 4
|
|
||||||
%"40" = load i64, ptr addrspace(4) %"33", align 4
|
|
||||||
store i64 %"40", ptr addrspace(5) %"35", align 4
|
|
||||||
%"42" = load i64, ptr addrspace(5) %"34", align 4
|
|
||||||
%"49" = inttoptr i64 %"42" to ptr
|
|
||||||
%"41" = load i32, ptr %"49", align 4
|
|
||||||
store i32 %"41", ptr addrspace(5) %"36", align 4
|
|
||||||
%"44" = load i32, ptr addrspace(5) %"36", align 4
|
|
||||||
%2 = call i32 @llvm.smax.i32(i32 %"44", i32 0)
|
|
||||||
%3 = call i32 @llvm.umin.i32(i32 %2, i32 -1)
|
|
||||||
store i32 %3, ptr addrspace(5) %"37", align 4
|
|
||||||
%"46" = load i32, ptr addrspace(5) %"37", align 4
|
|
||||||
store i32 %"46", ptr addrspace(5) %"38", align 4
|
|
||||||
%"47" = load i64, ptr addrspace(5) %"35", align 4
|
|
||||||
%"48" = load i32, ptr addrspace(5) %"38", align 4
|
|
||||||
%"50" = inttoptr i64 %"47" to ptr
|
|
||||||
store i32 %"48", ptr %"50", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
|
||||||
declare i32 @llvm.smax.i32(i32, i32) #1
|
|
||||||
|
|
||||||
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
|
||||||
declare i32 @llvm.umin.i32(i32, i32) #1
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
||||||
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
|
@ -1,34 +0,0 @@
|
|||||||
define amdgpu_kernel void @cvta(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
|
|
||||||
%"32" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"33" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"34" = alloca float, align 4, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"29"
|
|
||||||
|
|
||||||
"29": ; preds = %1
|
|
||||||
%"35" = load i64, ptr addrspace(4) %"30", align 4
|
|
||||||
store i64 %"35", ptr addrspace(5) %"32", align 4
|
|
||||||
%"36" = load i64, ptr addrspace(4) %"31", align 4
|
|
||||||
store i64 %"36", ptr addrspace(5) %"33", align 4
|
|
||||||
%"38" = load i64, ptr addrspace(5) %"32", align 4
|
|
||||||
%2 = inttoptr i64 %"38" to ptr
|
|
||||||
%"45" = addrspacecast ptr %2 to ptr addrspace(1)
|
|
||||||
store ptr addrspace(1) %"45", ptr addrspace(5) %"32", align 8
|
|
||||||
%"40" = load i64, ptr addrspace(5) %"33", align 4
|
|
||||||
%3 = inttoptr i64 %"40" to ptr
|
|
||||||
%"47" = addrspacecast ptr %3 to ptr addrspace(1)
|
|
||||||
store ptr addrspace(1) %"47", ptr addrspace(5) %"33", align 8
|
|
||||||
%"42" = load i64, ptr addrspace(5) %"32", align 4
|
|
||||||
%"49" = inttoptr i64 %"42" to ptr addrspace(1)
|
|
||||||
%"41" = load float, ptr addrspace(1) %"49", align 4
|
|
||||||
store float %"41", ptr addrspace(5) %"34", align 4
|
|
||||||
%"43" = load i64, ptr addrspace(5) %"33", align 4
|
|
||||||
%"44" = load float, ptr addrspace(5) %"34", align 4
|
|
||||||
%"50" = inttoptr i64 %"43" to ptr addrspace(1)
|
|
||||||
store float %"44", ptr addrspace(1) %"50", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
@ -1,36 +0,0 @@
|
|||||||
define amdgpu_kernel void @div_approx(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 {
|
|
||||||
%"35" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"36" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"37" = alloca float, align 4, addrspace(5)
|
|
||||||
%"38" = alloca float, align 4, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"32"
|
|
||||||
|
|
||||||
"32": ; preds = %1
|
|
||||||
%"39" = load i64, ptr addrspace(4) %"33", align 4
|
|
||||||
store i64 %"39", ptr addrspace(5) %"35", align 4
|
|
||||||
%"40" = load i64, ptr addrspace(4) %"34", align 4
|
|
||||||
store i64 %"40", ptr addrspace(5) %"36", align 4
|
|
||||||
%"42" = load i64, ptr addrspace(5) %"35", align 4
|
|
||||||
%"50" = inttoptr i64 %"42" to ptr
|
|
||||||
%"41" = load float, ptr %"50", align 4
|
|
||||||
store float %"41", ptr addrspace(5) %"37", align 4
|
|
||||||
%"43" = load i64, ptr addrspace(5) %"35", align 4
|
|
||||||
%"51" = inttoptr i64 %"43" to ptr
|
|
||||||
%"31" = getelementptr inbounds i8, ptr %"51", i64 4
|
|
||||||
%"44" = load float, ptr %"31", align 4
|
|
||||||
store float %"44", ptr addrspace(5) %"38", align 4
|
|
||||||
%"46" = load float, ptr addrspace(5) %"37", align 4
|
|
||||||
%"47" = load float, ptr addrspace(5) %"38", align 4
|
|
||||||
%"45" = fdiv arcp afn float %"46", %"47"
|
|
||||||
store float %"45", ptr addrspace(5) %"37", align 4
|
|
||||||
%"48" = load i64, ptr addrspace(5) %"36", align 4
|
|
||||||
%"49" = load float, ptr addrspace(5) %"37", align 4
|
|
||||||
%"52" = inttoptr i64 %"48" to ptr
|
|
||||||
store float %"49", ptr %"52", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
@ -1,33 +0,0 @@
|
|||||||
define amdgpu_kernel void @ex2(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
|
|
||||||
%"32" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"33" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"34" = alloca float, align 4, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"29"
|
|
||||||
|
|
||||||
"29": ; preds = %1
|
|
||||||
%"35" = load i64, ptr addrspace(4) %"30", align 4
|
|
||||||
store i64 %"35", ptr addrspace(5) %"32", align 4
|
|
||||||
%"36" = load i64, ptr addrspace(4) %"31", align 4
|
|
||||||
store i64 %"36", ptr addrspace(5) %"33", align 4
|
|
||||||
%"38" = load i64, ptr addrspace(5) %"32", align 4
|
|
||||||
%"43" = inttoptr i64 %"38" to ptr
|
|
||||||
%"37" = load float, ptr %"43", align 4
|
|
||||||
store float %"37", ptr addrspace(5) %"34", align 4
|
|
||||||
%"40" = load float, ptr addrspace(5) %"34", align 4
|
|
||||||
%"39" = call float @llvm.amdgcn.exp2.f32(float %"40")
|
|
||||||
store float %"39", ptr addrspace(5) %"34", align 4
|
|
||||||
%"41" = load i64, ptr addrspace(5) %"33", align 4
|
|
||||||
%"42" = load float, ptr addrspace(5) %"34", align 4
|
|
||||||
%"44" = inttoptr i64 %"41" to ptr
|
|
||||||
store float %"42", ptr %"44", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
|
||||||
declare float @llvm.amdgcn.exp2.f32(float) #1
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
||||||
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
|
@ -1,32 +0,0 @@
|
|||||||
@shared_mem = external addrspace(3) global [0 x i32]
|
|
||||||
|
|
||||||
define amdgpu_kernel void @extern_shared(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
|
|
||||||
%"33" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"34" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"35" = alloca i64, align 8, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"30"
|
|
||||||
|
|
||||||
"30": ; preds = %1
|
|
||||||
%"36" = load i64, ptr addrspace(4) %"31", align 4
|
|
||||||
store i64 %"36", ptr addrspace(5) %"33", align 4
|
|
||||||
%"37" = load i64, ptr addrspace(4) %"32", align 4
|
|
||||||
store i64 %"37", ptr addrspace(5) %"34", align 4
|
|
||||||
%"39" = load i64, ptr addrspace(5) %"33", align 4
|
|
||||||
%"44" = inttoptr i64 %"39" to ptr addrspace(1)
|
|
||||||
%"38" = load i64, ptr addrspace(1) %"44", align 4
|
|
||||||
store i64 %"38", ptr addrspace(5) %"35", align 4
|
|
||||||
%"40" = load i64, ptr addrspace(5) %"35", align 4
|
|
||||||
store i64 %"40", ptr addrspace(3) @shared_mem, align 4
|
|
||||||
%"41" = load i64, ptr addrspace(3) @shared_mem, align 4
|
|
||||||
store i64 %"41", ptr addrspace(5) %"35", align 4
|
|
||||||
%"42" = load i64, ptr addrspace(5) %"34", align 4
|
|
||||||
%"43" = load i64, ptr addrspace(5) %"35", align 4
|
|
||||||
%"47" = inttoptr i64 %"42" to ptr addrspace(1)
|
|
||||||
store i64 %"43", ptr addrspace(1) %"47", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
@ -1,55 +0,0 @@
|
|||||||
@shared_mem = external addrspace(3) global [0 x i32], align 4
|
|
||||||
|
|
||||||
define void @incr_shared_2_global() #0 {
|
|
||||||
%"36" = alloca i64, align 8, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"33"
|
|
||||||
|
|
||||||
"33": ; preds = %1
|
|
||||||
%"37" = load i64, ptr addrspace(3) @shared_mem, align 4
|
|
||||||
store i64 %"37", ptr addrspace(5) %"36", align 4
|
|
||||||
%"39" = load i64, ptr addrspace(5) %"36", align 4
|
|
||||||
%"38" = add i64 %"39", 2
|
|
||||||
store i64 %"38", ptr addrspace(5) %"36", align 4
|
|
||||||
%"40" = load i64, ptr addrspace(5) %"36", align 4
|
|
||||||
store i64 %"40", ptr addrspace(3) @shared_mem, align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
define amdgpu_kernel void @extern_shared_call(ptr addrspace(4) byref(i64) %"41", ptr addrspace(4) byref(i64) %"42") #1 {
|
|
||||||
%"43" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"44" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"45" = alloca i64, align 8, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"34"
|
|
||||||
|
|
||||||
"34": ; preds = %1
|
|
||||||
%"46" = load i64, ptr addrspace(4) %"41", align 4
|
|
||||||
store i64 %"46", ptr addrspace(5) %"43", align 4
|
|
||||||
%"47" = load i64, ptr addrspace(4) %"42", align 4
|
|
||||||
store i64 %"47", ptr addrspace(5) %"44", align 4
|
|
||||||
%"49" = load i64, ptr addrspace(5) %"43", align 4
|
|
||||||
%"56" = inttoptr i64 %"49" to ptr addrspace(1)
|
|
||||||
%"48" = load i64, ptr addrspace(1) %"56", align 4
|
|
||||||
store i64 %"48", ptr addrspace(5) %"45", align 4
|
|
||||||
%"50" = load i64, ptr addrspace(5) %"45", align 4
|
|
||||||
store i64 %"50", ptr addrspace(3) @shared_mem, align 4
|
|
||||||
call void @incr_shared_2_global()
|
|
||||||
br label %"35"
|
|
||||||
|
|
||||||
"35": ; preds = %"34"
|
|
||||||
%"51" = load i64, ptr addrspace(3) @shared_mem, align 4
|
|
||||||
store i64 %"51", ptr addrspace(5) %"45", align 4
|
|
||||||
%"52" = load i64, ptr addrspace(5) %"44", align 4
|
|
||||||
%"53" = load i64, ptr addrspace(5) %"45", align 4
|
|
||||||
%"59" = inttoptr i64 %"52" to ptr addrspace(1)
|
|
||||||
store i64 %"53", ptr addrspace(1) %"59", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
||||||
attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
@ -1,47 +0,0 @@
|
|||||||
define amdgpu_kernel void @fma(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 {
|
|
||||||
%"38" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"39" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"40" = alloca float, align 4, addrspace(5)
|
|
||||||
%"41" = alloca float, align 4, addrspace(5)
|
|
||||||
%"42" = alloca float, align 4, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"35"
|
|
||||||
|
|
||||||
"35": ; preds = %1
|
|
||||||
%"43" = load i64, ptr addrspace(4) %"36", align 4
|
|
||||||
store i64 %"43", ptr addrspace(5) %"38", align 4
|
|
||||||
%"44" = load i64, ptr addrspace(4) %"37", align 4
|
|
||||||
store i64 %"44", ptr addrspace(5) %"39", align 4
|
|
||||||
%"46" = load i64, ptr addrspace(5) %"38", align 4
|
|
||||||
%"57" = inttoptr i64 %"46" to ptr
|
|
||||||
%"45" = load float, ptr %"57", align 4
|
|
||||||
store float %"45", ptr addrspace(5) %"40", align 4
|
|
||||||
%"47" = load i64, ptr addrspace(5) %"38", align 4
|
|
||||||
%"58" = inttoptr i64 %"47" to ptr
|
|
||||||
%"32" = getelementptr inbounds i8, ptr %"58", i64 4
|
|
||||||
%"48" = load float, ptr %"32", align 4
|
|
||||||
store float %"48", ptr addrspace(5) %"41", align 4
|
|
||||||
%"49" = load i64, ptr addrspace(5) %"38", align 4
|
|
||||||
%"59" = inttoptr i64 %"49" to ptr
|
|
||||||
%"34" = getelementptr inbounds i8, ptr %"59", i64 8
|
|
||||||
%"50" = load float, ptr %"34", align 4
|
|
||||||
store float %"50", ptr addrspace(5) %"42", align 4
|
|
||||||
%"52" = load float, ptr addrspace(5) %"40", align 4
|
|
||||||
%"53" = load float, ptr addrspace(5) %"41", align 4
|
|
||||||
%"54" = load float, ptr addrspace(5) %"42", align 4
|
|
||||||
%"51" = call float @llvm.fma.f32(float %"52", float %"53", float %"54")
|
|
||||||
store float %"51", ptr addrspace(5) %"40", align 4
|
|
||||||
%"55" = load i64, ptr addrspace(5) %"39", align 4
|
|
||||||
%"56" = load float, ptr addrspace(5) %"40", align 4
|
|
||||||
%"60" = inttoptr i64 %"55" to ptr
|
|
||||||
store float %"56", ptr %"60", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
|
||||||
declare float @llvm.fma.f32(float, float, float) #1
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
||||||
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
|
@ -1,27 +0,0 @@
|
|||||||
@foobar = addrspace(1) global [4 x i32] [i32 1, i32 0, i32 0, i32 0]
|
|
||||||
|
|
||||||
define amdgpu_kernel void @global_array(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
|
|
||||||
%"33" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"34" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"35" = alloca i32, align 4, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"30"
|
|
||||||
|
|
||||||
"30": ; preds = %1
|
|
||||||
store i64 ptrtoint (ptr addrspace(1) @foobar to i64), ptr addrspace(5) %"33", align 4
|
|
||||||
%"37" = load i64, ptr addrspace(4) %"32", align 4
|
|
||||||
store i64 %"37", ptr addrspace(5) %"34", align 4
|
|
||||||
%"39" = load i64, ptr addrspace(5) %"33", align 4
|
|
||||||
%"43" = inttoptr i64 %"39" to ptr addrspace(1)
|
|
||||||
%"38" = load i32, ptr addrspace(1) %"43", align 4
|
|
||||||
store i32 %"38", ptr addrspace(5) %"35", align 4
|
|
||||||
%"40" = load i64, ptr addrspace(5) %"34", align 4
|
|
||||||
%"41" = load i32, ptr addrspace(5) %"35", align 4
|
|
||||||
%"44" = inttoptr i64 %"40" to ptr addrspace(1)
|
|
||||||
store i32 %"41", ptr addrspace(1) %"44", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
@ -1,26 +0,0 @@
|
|||||||
define amdgpu_kernel void @ld_st(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
|
|
||||||
%"32" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"33" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"34" = alloca i64, align 8, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"29"
|
|
||||||
|
|
||||||
"29": ; preds = %1
|
|
||||||
%"35" = load i64, ptr addrspace(4) %"30", align 4
|
|
||||||
store i64 %"35", ptr addrspace(5) %"32", align 4
|
|
||||||
%"36" = load i64, ptr addrspace(4) %"31", align 4
|
|
||||||
store i64 %"36", ptr addrspace(5) %"33", align 4
|
|
||||||
%"38" = load i64, ptr addrspace(5) %"32", align 4
|
|
||||||
%"41" = inttoptr i64 %"38" to ptr
|
|
||||||
%"37" = load i64, ptr %"41", align 4
|
|
||||||
store i64 %"37", ptr addrspace(5) %"34", align 4
|
|
||||||
%"39" = load i64, ptr addrspace(5) %"33", align 4
|
|
||||||
%"40" = load i64, ptr addrspace(5) %"34", align 4
|
|
||||||
%"42" = inttoptr i64 %"39" to ptr
|
|
||||||
store i64 %"40", ptr %"42", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
@ -1,31 +0,0 @@
|
|||||||
define amdgpu_kernel void @ld_st_implicit(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
|
|
||||||
%"33" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"34" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"35" = alloca i64, align 8, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"30"
|
|
||||||
|
|
||||||
"30": ; preds = %1
|
|
||||||
%"36" = load i64, ptr addrspace(4) %"31", align 4
|
|
||||||
store i64 %"36", ptr addrspace(5) %"33", align 4
|
|
||||||
%"37" = load i64, ptr addrspace(4) %"32", align 4
|
|
||||||
store i64 %"37", ptr addrspace(5) %"34", align 4
|
|
||||||
store i64 81985529216486895, ptr addrspace(5) %"35", align 4
|
|
||||||
%"40" = load i64, ptr addrspace(5) %"33", align 4
|
|
||||||
%"44" = inttoptr i64 %"40" to ptr addrspace(1)
|
|
||||||
%"43" = load float, ptr addrspace(1) %"44", align 4
|
|
||||||
%2 = bitcast float %"43" to i32
|
|
||||||
%"39" = zext i32 %2 to i64
|
|
||||||
store i64 %"39", ptr addrspace(5) %"35", align 4
|
|
||||||
%"41" = load i64, ptr addrspace(5) %"34", align 4
|
|
||||||
%"42" = load i64, ptr addrspace(5) %"35", align 4
|
|
||||||
%"45" = inttoptr i64 %"41" to ptr addrspace(1)
|
|
||||||
%3 = trunc i64 %"42" to i32
|
|
||||||
%"46" = bitcast i32 %3 to float
|
|
||||||
store float %"46", ptr addrspace(1) %"45", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
@ -1,37 +0,0 @@
|
|||||||
define amdgpu_kernel void @ld_st_offset(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 {
|
|
||||||
%"37" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"38" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"39" = alloca i32, align 4, addrspace(5)
|
|
||||||
%"40" = alloca i32, align 4, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"34"
|
|
||||||
|
|
||||||
"34": ; preds = %1
|
|
||||||
%"41" = load i64, ptr addrspace(4) %"35", align 4
|
|
||||||
store i64 %"41", ptr addrspace(5) %"37", align 4
|
|
||||||
%"42" = load i64, ptr addrspace(4) %"36", align 4
|
|
||||||
store i64 %"42", ptr addrspace(5) %"38", align 4
|
|
||||||
%"44" = load i64, ptr addrspace(5) %"37", align 4
|
|
||||||
%"51" = inttoptr i64 %"44" to ptr
|
|
||||||
%"43" = load i32, ptr %"51", align 4
|
|
||||||
store i32 %"43", ptr addrspace(5) %"39", align 4
|
|
||||||
%"45" = load i64, ptr addrspace(5) %"37", align 4
|
|
||||||
%"52" = inttoptr i64 %"45" to ptr
|
|
||||||
%"31" = getelementptr inbounds i8, ptr %"52", i64 4
|
|
||||||
%"46" = load i32, ptr %"31", align 4
|
|
||||||
store i32 %"46", ptr addrspace(5) %"40", align 4
|
|
||||||
%"47" = load i64, ptr addrspace(5) %"38", align 4
|
|
||||||
%"48" = load i32, ptr addrspace(5) %"40", align 4
|
|
||||||
%"53" = inttoptr i64 %"47" to ptr
|
|
||||||
store i32 %"48", ptr %"53", align 4
|
|
||||||
%"49" = load i64, ptr addrspace(5) %"38", align 4
|
|
||||||
%"54" = inttoptr i64 %"49" to ptr
|
|
||||||
%"33" = getelementptr inbounds i8, ptr %"54", i64 4
|
|
||||||
%"50" = load i32, ptr addrspace(5) %"39", align 4
|
|
||||||
store i32 %"50", ptr %"33", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
@ -1,33 +0,0 @@
|
|||||||
define amdgpu_kernel void @lg2(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
|
|
||||||
%"32" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"33" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"34" = alloca float, align 4, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"29"
|
|
||||||
|
|
||||||
"29": ; preds = %1
|
|
||||||
%"35" = load i64, ptr addrspace(4) %"30", align 4
|
|
||||||
store i64 %"35", ptr addrspace(5) %"32", align 4
|
|
||||||
%"36" = load i64, ptr addrspace(4) %"31", align 4
|
|
||||||
store i64 %"36", ptr addrspace(5) %"33", align 4
|
|
||||||
%"38" = load i64, ptr addrspace(5) %"32", align 4
|
|
||||||
%"43" = inttoptr i64 %"38" to ptr
|
|
||||||
%"37" = load float, ptr %"43", align 4
|
|
||||||
store float %"37", ptr addrspace(5) %"34", align 4
|
|
||||||
%"40" = load float, ptr addrspace(5) %"34", align 4
|
|
||||||
%"39" = call float @llvm.amdgcn.log.f32(float %"40")
|
|
||||||
store float %"39", ptr addrspace(5) %"34", align 4
|
|
||||||
%"41" = load i64, ptr addrspace(5) %"33", align 4
|
|
||||||
%"42" = load float, ptr addrspace(5) %"34", align 4
|
|
||||||
%"44" = inttoptr i64 %"41" to ptr
|
|
||||||
store float %"42", ptr %"44", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
|
||||||
declare float @llvm.amdgcn.log.f32(float) #1
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
||||||
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
|
@ -1,27 +0,0 @@
|
|||||||
define amdgpu_kernel void @local_align(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
|
|
||||||
%"10" = alloca [8 x i8], align 8, addrspace(5)
|
|
||||||
%"33" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"34" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"35" = alloca i64, align 8, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"30"
|
|
||||||
|
|
||||||
"30": ; preds = %1
|
|
||||||
%"36" = load i64, ptr addrspace(4) %"31", align 4
|
|
||||||
store i64 %"36", ptr addrspace(5) %"33", align 4
|
|
||||||
%"37" = load i64, ptr addrspace(4) %"32", align 4
|
|
||||||
store i64 %"37", ptr addrspace(5) %"34", align 4
|
|
||||||
%"39" = load i64, ptr addrspace(5) %"33", align 4
|
|
||||||
%"42" = inttoptr i64 %"39" to ptr
|
|
||||||
%"38" = load i64, ptr %"42", align 4
|
|
||||||
store i64 %"38", ptr addrspace(5) %"35", align 4
|
|
||||||
%"40" = load i64, ptr addrspace(5) %"34", align 4
|
|
||||||
%"41" = load i64, ptr addrspace(5) %"35", align 4
|
|
||||||
%"43" = inttoptr i64 %"40" to ptr
|
|
||||||
store i64 %"41", ptr %"43", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
@ -1,55 +0,0 @@
|
|||||||
define amdgpu_kernel void @mad_s32(ptr addrspace(4) byref(i64) %"41", ptr addrspace(4) byref(i64) %"42") #0 {
|
|
||||||
%"43" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"44" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"45" = alloca i32, align 4, addrspace(5)
|
|
||||||
%"46" = alloca i32, align 4, addrspace(5)
|
|
||||||
%"47" = alloca i32, align 4, addrspace(5)
|
|
||||||
%"48" = alloca i32, align 4, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"40"
|
|
||||||
|
|
||||||
"40": ; preds = %1
|
|
||||||
%"49" = load i64, ptr addrspace(4) %"41", align 4
|
|
||||||
store i64 %"49", ptr addrspace(5) %"43", align 4
|
|
||||||
%"50" = load i64, ptr addrspace(4) %"42", align 4
|
|
||||||
store i64 %"50", ptr addrspace(5) %"44", align 4
|
|
||||||
%"52" = load i64, ptr addrspace(5) %"43", align 4
|
|
||||||
%"67" = inttoptr i64 %"52" to ptr
|
|
||||||
%"51" = load i32, ptr %"67", align 4
|
|
||||||
store i32 %"51", ptr addrspace(5) %"46", align 4
|
|
||||||
%"53" = load i64, ptr addrspace(5) %"43", align 4
|
|
||||||
%"68" = inttoptr i64 %"53" to ptr
|
|
||||||
%"33" = getelementptr inbounds i8, ptr %"68", i64 4
|
|
||||||
%"54" = load i32, ptr %"33", align 4
|
|
||||||
store i32 %"54", ptr addrspace(5) %"47", align 4
|
|
||||||
%"55" = load i64, ptr addrspace(5) %"43", align 4
|
|
||||||
%"69" = inttoptr i64 %"55" to ptr
|
|
||||||
%"35" = getelementptr inbounds i8, ptr %"69", i64 8
|
|
||||||
%"56" = load i32, ptr %"35", align 4
|
|
||||||
store i32 %"56", ptr addrspace(5) %"48", align 4
|
|
||||||
%"58" = load i32, ptr addrspace(5) %"46", align 4
|
|
||||||
%"59" = load i32, ptr addrspace(5) %"47", align 4
|
|
||||||
%"60" = load i32, ptr addrspace(5) %"48", align 4
|
|
||||||
%2 = mul i32 %"58", %"59"
|
|
||||||
%"57" = add i32 %2, %"60"
|
|
||||||
store i32 %"57", ptr addrspace(5) %"45", align 4
|
|
||||||
%"61" = load i64, ptr addrspace(5) %"44", align 4
|
|
||||||
%"62" = load i32, ptr addrspace(5) %"45", align 4
|
|
||||||
%"70" = inttoptr i64 %"61" to ptr
|
|
||||||
store i32 %"62", ptr %"70", align 4
|
|
||||||
%"63" = load i64, ptr addrspace(5) %"44", align 4
|
|
||||||
%"71" = inttoptr i64 %"63" to ptr
|
|
||||||
%"37" = getelementptr inbounds i8, ptr %"71", i64 4
|
|
||||||
%"64" = load i32, ptr addrspace(5) %"45", align 4
|
|
||||||
store i32 %"64", ptr %"37", align 4
|
|
||||||
%"65" = load i64, ptr addrspace(5) %"44", align 4
|
|
||||||
%"72" = inttoptr i64 %"65" to ptr
|
|
||||||
%"39" = getelementptr inbounds i8, ptr %"72", i64 8
|
|
||||||
%"66" = load i32, ptr addrspace(5) %"45", align 4
|
|
||||||
store i32 %"66", ptr %"39", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
@ -1,33 +0,0 @@
|
|||||||
define amdgpu_kernel void @malformed_label(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 {
|
|
||||||
%"36" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"37" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"38" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"39" = alloca i64, align 8, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"32"
|
|
||||||
|
|
||||||
"32": ; preds = %1
|
|
||||||
%"40" = load i64, ptr addrspace(4) %"34", align 4
|
|
||||||
store i64 %"40", ptr addrspace(5) %"36", align 4
|
|
||||||
%"41" = load i64, ptr addrspace(4) %"35", align 4
|
|
||||||
store i64 %"41", ptr addrspace(5) %"37", align 4
|
|
||||||
br label %"10"
|
|
||||||
|
|
||||||
"10": ; preds = %"32"
|
|
||||||
%"43" = load i64, ptr addrspace(5) %"36", align 4
|
|
||||||
%"48" = inttoptr i64 %"43" to ptr
|
|
||||||
%"42" = load i64, ptr %"48", align 4
|
|
||||||
store i64 %"42", ptr addrspace(5) %"38", align 4
|
|
||||||
%"45" = load i64, ptr addrspace(5) %"38", align 4
|
|
||||||
%"44" = add i64 %"45", 1
|
|
||||||
store i64 %"44", ptr addrspace(5) %"39", align 4
|
|
||||||
%"46" = load i64, ptr addrspace(5) %"37", align 4
|
|
||||||
%"47" = load i64, ptr addrspace(5) %"39", align 4
|
|
||||||
%"49" = inttoptr i64 %"46" to ptr
|
|
||||||
store i64 %"47", ptr %"49", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
@ -1,40 +0,0 @@
|
|||||||
define amdgpu_kernel void @max(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 {
|
|
||||||
%"35" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"36" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"37" = alloca i32, align 4, addrspace(5)
|
|
||||||
%"38" = alloca i32, align 4, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"32"
|
|
||||||
|
|
||||||
"32": ; preds = %1
|
|
||||||
%"39" = load i64, ptr addrspace(4) %"33", align 4
|
|
||||||
store i64 %"39", ptr addrspace(5) %"35", align 4
|
|
||||||
%"40" = load i64, ptr addrspace(4) %"34", align 4
|
|
||||||
store i64 %"40", ptr addrspace(5) %"36", align 4
|
|
||||||
%"42" = load i64, ptr addrspace(5) %"35", align 4
|
|
||||||
%"50" = inttoptr i64 %"42" to ptr
|
|
||||||
%"41" = load i32, ptr %"50", align 4
|
|
||||||
store i32 %"41", ptr addrspace(5) %"37", align 4
|
|
||||||
%"43" = load i64, ptr addrspace(5) %"35", align 4
|
|
||||||
%"51" = inttoptr i64 %"43" to ptr
|
|
||||||
%"31" = getelementptr inbounds i8, ptr %"51", i64 4
|
|
||||||
%"44" = load i32, ptr %"31", align 4
|
|
||||||
store i32 %"44", ptr addrspace(5) %"38", align 4
|
|
||||||
%"46" = load i32, ptr addrspace(5) %"37", align 4
|
|
||||||
%"47" = load i32, ptr addrspace(5) %"38", align 4
|
|
||||||
%"45" = call i32 @llvm.smax.i32(i32 %"46", i32 %"47")
|
|
||||||
store i32 %"45", ptr addrspace(5) %"37", align 4
|
|
||||||
%"48" = load i64, ptr addrspace(5) %"36", align 4
|
|
||||||
%"49" = load i32, ptr addrspace(5) %"37", align 4
|
|
||||||
%"52" = inttoptr i64 %"48" to ptr
|
|
||||||
store i32 %"49", ptr %"52", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
|
||||||
declare i32 @llvm.smax.i32(i32, i32) #1
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
||||||
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
|
@ -1,27 +0,0 @@
|
|||||||
define amdgpu_kernel void @membar(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
|
|
||||||
%"32" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"33" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"34" = alloca i32, align 4, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"29"
|
|
||||||
|
|
||||||
"29": ; preds = %1
|
|
||||||
%"35" = load i64, ptr addrspace(4) %"30", align 4
|
|
||||||
store i64 %"35", ptr addrspace(5) %"32", align 4
|
|
||||||
%"36" = load i64, ptr addrspace(4) %"31", align 4
|
|
||||||
store i64 %"36", ptr addrspace(5) %"33", align 4
|
|
||||||
%"38" = load i64, ptr addrspace(5) %"32", align 4
|
|
||||||
%"42" = inttoptr i64 %"38" to ptr
|
|
||||||
%"41" = load i32, ptr %"42", align 4
|
|
||||||
store i32 %"41", ptr addrspace(5) %"34", align 4
|
|
||||||
fence seq_cst
|
|
||||||
%"39" = load i64, ptr addrspace(5) %"33", align 4
|
|
||||||
%"40" = load i32, ptr addrspace(5) %"34", align 4
|
|
||||||
%"43" = inttoptr i64 %"39" to ptr
|
|
||||||
store i32 %"40", ptr %"43", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
@ -1,40 +0,0 @@
|
|||||||
define amdgpu_kernel void @min(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 {
|
|
||||||
%"35" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"36" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"37" = alloca i32, align 4, addrspace(5)
|
|
||||||
%"38" = alloca i32, align 4, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"32"
|
|
||||||
|
|
||||||
"32": ; preds = %1
|
|
||||||
%"39" = load i64, ptr addrspace(4) %"33", align 4
|
|
||||||
store i64 %"39", ptr addrspace(5) %"35", align 4
|
|
||||||
%"40" = load i64, ptr addrspace(4) %"34", align 4
|
|
||||||
store i64 %"40", ptr addrspace(5) %"36", align 4
|
|
||||||
%"42" = load i64, ptr addrspace(5) %"35", align 4
|
|
||||||
%"50" = inttoptr i64 %"42" to ptr
|
|
||||||
%"41" = load i32, ptr %"50", align 4
|
|
||||||
store i32 %"41", ptr addrspace(5) %"37", align 4
|
|
||||||
%"43" = load i64, ptr addrspace(5) %"35", align 4
|
|
||||||
%"51" = inttoptr i64 %"43" to ptr
|
|
||||||
%"31" = getelementptr inbounds i8, ptr %"51", i64 4
|
|
||||||
%"44" = load i32, ptr %"31", align 4
|
|
||||||
store i32 %"44", ptr addrspace(5) %"38", align 4
|
|
||||||
%"46" = load i32, ptr addrspace(5) %"37", align 4
|
|
||||||
%"47" = load i32, ptr addrspace(5) %"38", align 4
|
|
||||||
%"45" = call i32 @llvm.smin.i32(i32 %"46", i32 %"47")
|
|
||||||
store i32 %"45", ptr addrspace(5) %"37", align 4
|
|
||||||
%"48" = load i64, ptr addrspace(5) %"36", align 4
|
|
||||||
%"49" = load i32, ptr addrspace(5) %"37", align 4
|
|
||||||
%"52" = inttoptr i64 %"48" to ptr
|
|
||||||
store i32 %"49", ptr %"52", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
|
||||||
declare i32 @llvm.smin.i32(i32, i32) #1
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
||||||
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
|
@ -1,29 +0,0 @@
|
|||||||
define amdgpu_kernel void @mov(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
|
|
||||||
%"33" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"34" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"35" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"36" = alloca i64, align 8, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"30"
|
|
||||||
|
|
||||||
"30": ; preds = %1
|
|
||||||
%"37" = load i64, ptr addrspace(4) %"31", align 4
|
|
||||||
store i64 %"37", ptr addrspace(5) %"33", align 4
|
|
||||||
%"38" = load i64, ptr addrspace(4) %"32", align 4
|
|
||||||
store i64 %"38", ptr addrspace(5) %"34", align 4
|
|
||||||
%"40" = load i64, ptr addrspace(5) %"33", align 4
|
|
||||||
%"45" = inttoptr i64 %"40" to ptr
|
|
||||||
%"39" = load i64, ptr %"45", align 4
|
|
||||||
store i64 %"39", ptr addrspace(5) %"35", align 4
|
|
||||||
%"42" = load i64, ptr addrspace(5) %"35", align 4
|
|
||||||
store i64 %"42", ptr addrspace(5) %"36", align 4
|
|
||||||
%"43" = load i64, ptr addrspace(5) %"34", align 4
|
|
||||||
%"44" = load i64, ptr addrspace(5) %"36", align 4
|
|
||||||
%"46" = inttoptr i64 %"43" to ptr
|
|
||||||
store i64 %"44", ptr %"46", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
@ -1,15 +0,0 @@
|
|||||||
define amdgpu_kernel void @mov_address(ptr addrspace(4) byref(i64) %"29", ptr addrspace(4) byref(i64) %"30") #0 {
|
|
||||||
%"10" = alloca [8 x i8], align 1, addrspace(5)
|
|
||||||
%"31" = alloca i64, align 8, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"28"
|
|
||||||
|
|
||||||
"28": ; preds = %1
|
|
||||||
%"33" = ptrtoint ptr addrspace(5) %"10" to i64
|
|
||||||
store i64 %"33", ptr addrspace(5) %"31", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
@ -1,46 +0,0 @@
|
|||||||
define amdgpu_kernel void @mul24_hi_s32(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) byref(i64) %"33") #0 {
|
|
||||||
%"34" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"35" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"36" = alloca i32, align 4, addrspace(5)
|
|
||||||
%"37" = alloca i32, align 4, addrspace(5)
|
|
||||||
%"38" = alloca i32, align 4, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"31"
|
|
||||||
|
|
||||||
"31": ; preds = %1
|
|
||||||
%"39" = load i64, ptr addrspace(4) %"32", align 4
|
|
||||||
store i64 %"39", ptr addrspace(5) %"34", align 4
|
|
||||||
%"40" = load i64, ptr addrspace(4) %"33", align 4
|
|
||||||
store i64 %"40", ptr addrspace(5) %"35", align 4
|
|
||||||
%"42" = load i64, ptr addrspace(5) %"34", align 4
|
|
||||||
%"50" = inttoptr i64 %"42" to ptr
|
|
||||||
%"41" = load i32, ptr %"50", align 4
|
|
||||||
store i32 %"41", ptr addrspace(5) %"36", align 4
|
|
||||||
%"44" = load i32, ptr addrspace(5) %"36", align 4
|
|
||||||
%"43" = sub i32 0, %"44"
|
|
||||||
store i32 %"43", ptr addrspace(5) %"37", align 4
|
|
||||||
%"46" = load i32, ptr addrspace(5) %"37", align 4
|
|
||||||
%"47" = load i32, ptr addrspace(5) %"36", align 4
|
|
||||||
%2 = call i32 @llvm.amdgcn.mul.i24(i32 %"46", i32 %"47")
|
|
||||||
%3 = call i32 @llvm.amdgcn.mulhi.i24(i32 %"46", i32 %"47")
|
|
||||||
%4 = lshr i32 %2, 16
|
|
||||||
%5 = shl i32 %3, 16
|
|
||||||
%"45" = or i32 %4, %5
|
|
||||||
store i32 %"45", ptr addrspace(5) %"38", align 4
|
|
||||||
%"48" = load i64, ptr addrspace(5) %"35", align 4
|
|
||||||
%"49" = load i32, ptr addrspace(5) %"38", align 4
|
|
||||||
%"51" = inttoptr i64 %"48" to ptr
|
|
||||||
store i32 %"49", ptr %"51", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
|
||||||
declare i32 @llvm.amdgcn.mul.i24(i32, i32) #1
|
|
||||||
|
|
||||||
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
|
||||||
declare i32 @llvm.amdgcn.mulhi.i24(i32, i32) #1
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
||||||
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
|
@ -1,42 +0,0 @@
|
|||||||
define amdgpu_kernel void @mul24_hi_u32(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
|
|
||||||
%"33" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"34" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"35" = alloca i32, align 4, addrspace(5)
|
|
||||||
%"36" = alloca i32, align 4, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"30"
|
|
||||||
|
|
||||||
"30": ; preds = %1
|
|
||||||
%"37" = load i64, ptr addrspace(4) %"31", align 4
|
|
||||||
store i64 %"37", ptr addrspace(5) %"33", align 4
|
|
||||||
%"38" = load i64, ptr addrspace(4) %"32", align 4
|
|
||||||
store i64 %"38", ptr addrspace(5) %"34", align 4
|
|
||||||
%"40" = load i64, ptr addrspace(5) %"33", align 4
|
|
||||||
%"46" = inttoptr i64 %"40" to ptr
|
|
||||||
%"39" = load i32, ptr %"46", align 4
|
|
||||||
store i32 %"39", ptr addrspace(5) %"35", align 4
|
|
||||||
%"42" = load i32, ptr addrspace(5) %"35", align 4
|
|
||||||
%"43" = load i32, ptr addrspace(5) %"35", align 4
|
|
||||||
%2 = call i32 @llvm.amdgcn.mul.u24(i32 %"42", i32 %"43")
|
|
||||||
%3 = call i32 @llvm.amdgcn.mulhi.u24(i32 %"42", i32 %"43")
|
|
||||||
%4 = lshr i32 %2, 16
|
|
||||||
%5 = shl i32 %3, 16
|
|
||||||
%"41" = or i32 %4, %5
|
|
||||||
store i32 %"41", ptr addrspace(5) %"36", align 4
|
|
||||||
%"44" = load i64, ptr addrspace(5) %"34", align 4
|
|
||||||
%"45" = load i32, ptr addrspace(5) %"36", align 4
|
|
||||||
%"47" = inttoptr i64 %"44" to ptr
|
|
||||||
store i32 %"45", ptr %"47", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
|
||||||
declare i32 @llvm.amdgcn.mul.u24(i32, i32) #1
|
|
||||||
|
|
||||||
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
|
||||||
declare i32 @llvm.amdgcn.mulhi.u24(i32, i32) #1
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
||||||
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
|
@ -1,39 +0,0 @@
|
|||||||
define amdgpu_kernel void @mul24_lo_s32(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) byref(i64) %"33") #0 {
|
|
||||||
%"34" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"35" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"36" = alloca i32, align 4, addrspace(5)
|
|
||||||
%"37" = alloca i32, align 4, addrspace(5)
|
|
||||||
%"38" = alloca i32, align 4, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"31"
|
|
||||||
|
|
||||||
"31": ; preds = %1
|
|
||||||
%"39" = load i64, ptr addrspace(4) %"32", align 4
|
|
||||||
store i64 %"39", ptr addrspace(5) %"34", align 4
|
|
||||||
%"40" = load i64, ptr addrspace(4) %"33", align 4
|
|
||||||
store i64 %"40", ptr addrspace(5) %"35", align 4
|
|
||||||
%"42" = load i64, ptr addrspace(5) %"34", align 4
|
|
||||||
%"50" = inttoptr i64 %"42" to ptr
|
|
||||||
%"41" = load i32, ptr %"50", align 4
|
|
||||||
store i32 %"41", ptr addrspace(5) %"36", align 4
|
|
||||||
%"44" = load i32, ptr addrspace(5) %"36", align 4
|
|
||||||
%"43" = sub i32 0, %"44"
|
|
||||||
store i32 %"43", ptr addrspace(5) %"37", align 4
|
|
||||||
%"46" = load i32, ptr addrspace(5) %"37", align 4
|
|
||||||
%"47" = load i32, ptr addrspace(5) %"36", align 4
|
|
||||||
%"45" = call i32 @llvm.amdgcn.mul.i24(i32 %"46", i32 %"47")
|
|
||||||
store i32 %"45", ptr addrspace(5) %"38", align 4
|
|
||||||
%"48" = load i64, ptr addrspace(5) %"35", align 4
|
|
||||||
%"49" = load i32, ptr addrspace(5) %"38", align 4
|
|
||||||
%"51" = inttoptr i64 %"48" to ptr
|
|
||||||
store i32 %"49", ptr %"51", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
|
||||||
declare i32 @llvm.amdgcn.mul.i24(i32, i32) #1
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
||||||
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
|
@ -1,35 +0,0 @@
|
|||||||
define amdgpu_kernel void @mul24_lo_u32(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
|
|
||||||
%"33" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"34" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"35" = alloca i32, align 4, addrspace(5)
|
|
||||||
%"36" = alloca i32, align 4, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"30"
|
|
||||||
|
|
||||||
"30": ; preds = %1
|
|
||||||
%"37" = load i64, ptr addrspace(4) %"31", align 4
|
|
||||||
store i64 %"37", ptr addrspace(5) %"33", align 4
|
|
||||||
%"38" = load i64, ptr addrspace(4) %"32", align 4
|
|
||||||
store i64 %"38", ptr addrspace(5) %"34", align 4
|
|
||||||
%"40" = load i64, ptr addrspace(5) %"33", align 4
|
|
||||||
%"46" = inttoptr i64 %"40" to ptr
|
|
||||||
%"39" = load i32, ptr %"46", align 4
|
|
||||||
store i32 %"39", ptr addrspace(5) %"35", align 4
|
|
||||||
%"42" = load i32, ptr addrspace(5) %"35", align 4
|
|
||||||
%"43" = load i32, ptr addrspace(5) %"35", align 4
|
|
||||||
%"41" = call i32 @llvm.amdgcn.mul.u24(i32 %"42", i32 %"43")
|
|
||||||
store i32 %"41", ptr addrspace(5) %"36", align 4
|
|
||||||
%"44" = load i64, ptr addrspace(5) %"34", align 4
|
|
||||||
%"45" = load i32, ptr addrspace(5) %"36", align 4
|
|
||||||
%"47" = inttoptr i64 %"44" to ptr
|
|
||||||
store i32 %"45", ptr %"47", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
|
||||||
declare i32 @llvm.amdgcn.mul.u24(i32, i32) #1
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
||||||
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
|
@ -1,36 +0,0 @@
|
|||||||
define amdgpu_kernel void @mul_ftz(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 {
|
|
||||||
%"35" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"36" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"37" = alloca float, align 4, addrspace(5)
|
|
||||||
%"38" = alloca float, align 4, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"32"
|
|
||||||
|
|
||||||
"32": ; preds = %1
|
|
||||||
%"39" = load i64, ptr addrspace(4) %"33", align 4
|
|
||||||
store i64 %"39", ptr addrspace(5) %"35", align 4
|
|
||||||
%"40" = load i64, ptr addrspace(4) %"34", align 4
|
|
||||||
store i64 %"40", ptr addrspace(5) %"36", align 4
|
|
||||||
%"42" = load i64, ptr addrspace(5) %"35", align 4
|
|
||||||
%"50" = inttoptr i64 %"42" to ptr
|
|
||||||
%"41" = load float, ptr %"50", align 4
|
|
||||||
store float %"41", ptr addrspace(5) %"37", align 4
|
|
||||||
%"43" = load i64, ptr addrspace(5) %"35", align 4
|
|
||||||
%"51" = inttoptr i64 %"43" to ptr
|
|
||||||
%"31" = getelementptr inbounds i8, ptr %"51", i64 4
|
|
||||||
%"44" = load float, ptr %"31", align 4
|
|
||||||
store float %"44", ptr addrspace(5) %"38", align 4
|
|
||||||
%"46" = load float, ptr addrspace(5) %"37", align 4
|
|
||||||
%"47" = load float, ptr addrspace(5) %"38", align 4
|
|
||||||
%"45" = fmul float %"46", %"47"
|
|
||||||
store float %"45", ptr addrspace(5) %"37", align 4
|
|
||||||
%"48" = load i64, ptr addrspace(5) %"36", align 4
|
|
||||||
%"49" = load float, ptr addrspace(5) %"37", align 4
|
|
||||||
%"52" = inttoptr i64 %"48" to ptr
|
|
||||||
store float %"49", ptr %"52", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
@ -1,33 +0,0 @@
|
|||||||
define amdgpu_kernel void @mul_hi(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) byref(i64) %"33") #0 {
|
|
||||||
%"34" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"35" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"36" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"37" = alloca i64, align 8, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"31"
|
|
||||||
|
|
||||||
"31": ; preds = %1
|
|
||||||
%"38" = load i64, ptr addrspace(4) %"32", align 4
|
|
||||||
store i64 %"38", ptr addrspace(5) %"34", align 4
|
|
||||||
%"39" = load i64, ptr addrspace(4) %"33", align 4
|
|
||||||
store i64 %"39", ptr addrspace(5) %"35", align 4
|
|
||||||
%"41" = load i64, ptr addrspace(5) %"34", align 4
|
|
||||||
%"46" = inttoptr i64 %"41" to ptr
|
|
||||||
%"40" = load i64, ptr %"46", align 4
|
|
||||||
store i64 %"40", ptr addrspace(5) %"36", align 4
|
|
||||||
%"43" = load i64, ptr addrspace(5) %"36", align 4
|
|
||||||
%2 = zext i64 %"43" to i128
|
|
||||||
%3 = mul i128 %2, 2
|
|
||||||
%4 = lshr i128 %3, 64
|
|
||||||
%"42" = trunc i128 %4 to i64
|
|
||||||
store i64 %"42", ptr addrspace(5) %"37", align 4
|
|
||||||
%"44" = load i64, ptr addrspace(5) %"35", align 4
|
|
||||||
%"45" = load i64, ptr addrspace(5) %"37", align 4
|
|
||||||
%"47" = inttoptr i64 %"44" to ptr
|
|
||||||
store i64 %"45", ptr %"47", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
@ -1,30 +0,0 @@
|
|||||||
define amdgpu_kernel void @mul_lo(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) byref(i64) %"33") #0 {
|
|
||||||
%"34" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"35" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"36" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"37" = alloca i64, align 8, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"31"
|
|
||||||
|
|
||||||
"31": ; preds = %1
|
|
||||||
%"38" = load i64, ptr addrspace(4) %"32", align 4
|
|
||||||
store i64 %"38", ptr addrspace(5) %"34", align 4
|
|
||||||
%"39" = load i64, ptr addrspace(4) %"33", align 4
|
|
||||||
store i64 %"39", ptr addrspace(5) %"35", align 4
|
|
||||||
%"41" = load i64, ptr addrspace(5) %"34", align 4
|
|
||||||
%"46" = inttoptr i64 %"41" to ptr
|
|
||||||
%"40" = load i64, ptr %"46", align 4
|
|
||||||
store i64 %"40", ptr addrspace(5) %"36", align 4
|
|
||||||
%"43" = load i64, ptr addrspace(5) %"36", align 4
|
|
||||||
%"42" = mul i64 %"43", 2
|
|
||||||
store i64 %"42", ptr addrspace(5) %"37", align 4
|
|
||||||
%"44" = load i64, ptr addrspace(5) %"35", align 4
|
|
||||||
%"45" = load i64, ptr addrspace(5) %"37", align 4
|
|
||||||
%"47" = inttoptr i64 %"44" to ptr
|
|
||||||
store i64 %"45", ptr %"47", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
@ -1,36 +0,0 @@
|
|||||||
define amdgpu_kernel void @mul_non_ftz(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 {
|
|
||||||
%"35" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"36" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"37" = alloca float, align 4, addrspace(5)
|
|
||||||
%"38" = alloca float, align 4, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"32"
|
|
||||||
|
|
||||||
"32": ; preds = %1
|
|
||||||
%"39" = load i64, ptr addrspace(4) %"33", align 4
|
|
||||||
store i64 %"39", ptr addrspace(5) %"35", align 4
|
|
||||||
%"40" = load i64, ptr addrspace(4) %"34", align 4
|
|
||||||
store i64 %"40", ptr addrspace(5) %"36", align 4
|
|
||||||
%"42" = load i64, ptr addrspace(5) %"35", align 4
|
|
||||||
%"50" = inttoptr i64 %"42" to ptr
|
|
||||||
%"41" = load float, ptr %"50", align 4
|
|
||||||
store float %"41", ptr addrspace(5) %"37", align 4
|
|
||||||
%"43" = load i64, ptr addrspace(5) %"35", align 4
|
|
||||||
%"51" = inttoptr i64 %"43" to ptr
|
|
||||||
%"31" = getelementptr inbounds i8, ptr %"51", i64 4
|
|
||||||
%"44" = load float, ptr %"31", align 4
|
|
||||||
store float %"44", ptr addrspace(5) %"38", align 4
|
|
||||||
%"46" = load float, ptr addrspace(5) %"37", align 4
|
|
||||||
%"47" = load float, ptr addrspace(5) %"38", align 4
|
|
||||||
%"45" = fmul float %"46", %"47"
|
|
||||||
store float %"45", ptr addrspace(5) %"37", align 4
|
|
||||||
%"48" = load i64, ptr addrspace(5) %"36", align 4
|
|
||||||
%"49" = load float, ptr addrspace(5) %"37", align 4
|
|
||||||
%"52" = inttoptr i64 %"48" to ptr
|
|
||||||
store float %"49", ptr %"52", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
@ -1,39 +0,0 @@
|
|||||||
define amdgpu_kernel void @mul_wide(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 {
|
|
||||||
%"36" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"37" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"38" = alloca i32, align 4, addrspace(5)
|
|
||||||
%"39" = alloca i32, align 4, addrspace(5)
|
|
||||||
%"40" = alloca i64, align 8, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"33"
|
|
||||||
|
|
||||||
"33": ; preds = %1
|
|
||||||
%"41" = load i64, ptr addrspace(4) %"34", align 4
|
|
||||||
store i64 %"41", ptr addrspace(5) %"36", align 4
|
|
||||||
%"42" = load i64, ptr addrspace(4) %"35", align 4
|
|
||||||
store i64 %"42", ptr addrspace(5) %"37", align 4
|
|
||||||
%"44" = load i64, ptr addrspace(5) %"36", align 4
|
|
||||||
%"52" = inttoptr i64 %"44" to ptr addrspace(1)
|
|
||||||
%"43" = load i32, ptr addrspace(1) %"52", align 4
|
|
||||||
store i32 %"43", ptr addrspace(5) %"38", align 4
|
|
||||||
%"45" = load i64, ptr addrspace(5) %"36", align 4
|
|
||||||
%"53" = inttoptr i64 %"45" to ptr addrspace(1)
|
|
||||||
%"32" = getelementptr inbounds i8, ptr addrspace(1) %"53", i64 4
|
|
||||||
%"46" = load i32, ptr addrspace(1) %"32", align 4
|
|
||||||
store i32 %"46", ptr addrspace(5) %"39", align 4
|
|
||||||
%"48" = load i32, ptr addrspace(5) %"38", align 4
|
|
||||||
%"49" = load i32, ptr addrspace(5) %"39", align 4
|
|
||||||
%2 = sext i32 %"48" to i64
|
|
||||||
%3 = sext i32 %"49" to i64
|
|
||||||
%"47" = mul i64 %2, %3
|
|
||||||
store i64 %"47", ptr addrspace(5) %"40", align 4
|
|
||||||
%"50" = load i64, ptr addrspace(5) %"37", align 4
|
|
||||||
%"51" = load i64, ptr addrspace(5) %"40", align 4
|
|
||||||
%"54" = inttoptr i64 %"50" to ptr
|
|
||||||
store i64 %"51", ptr %"54", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
@ -1,29 +0,0 @@
|
|||||||
define amdgpu_kernel void @neg(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
|
|
||||||
%"32" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"33" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"34" = alloca i32, align 4, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"29"
|
|
||||||
|
|
||||||
"29": ; preds = %1
|
|
||||||
%"35" = load i64, ptr addrspace(4) %"30", align 4
|
|
||||||
store i64 %"35", ptr addrspace(5) %"32", align 4
|
|
||||||
%"36" = load i64, ptr addrspace(4) %"31", align 4
|
|
||||||
store i64 %"36", ptr addrspace(5) %"33", align 4
|
|
||||||
%"38" = load i64, ptr addrspace(5) %"32", align 4
|
|
||||||
%"43" = inttoptr i64 %"38" to ptr
|
|
||||||
%"37" = load i32, ptr %"43", align 4
|
|
||||||
store i32 %"37", ptr addrspace(5) %"34", align 4
|
|
||||||
%"40" = load i32, ptr addrspace(5) %"34", align 4
|
|
||||||
%"39" = sub i32 0, %"40"
|
|
||||||
store i32 %"39", ptr addrspace(5) %"34", align 4
|
|
||||||
%"41" = load i64, ptr addrspace(5) %"33", align 4
|
|
||||||
%"42" = load i32, ptr addrspace(5) %"34", align 4
|
|
||||||
%"44" = inttoptr i64 %"41" to ptr
|
|
||||||
store i32 %"42", ptr %"44", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
@ -1,35 +0,0 @@
|
|||||||
define amdgpu_kernel void @non_scalar_ptr_offset(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 {
|
|
||||||
%"36" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"37" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"38" = alloca i32, align 4, addrspace(5)
|
|
||||||
%"39" = alloca i32, align 4, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"33"
|
|
||||||
|
|
||||||
"33": ; preds = %1
|
|
||||||
%"40" = load i64, ptr addrspace(4) %"34", align 4
|
|
||||||
store i64 %"40", ptr addrspace(5) %"36", align 4
|
|
||||||
%"41" = load i64, ptr addrspace(4) %"35", align 4
|
|
||||||
store i64 %"41", ptr addrspace(5) %"37", align 4
|
|
||||||
%"42" = load i64, ptr addrspace(5) %"36", align 4
|
|
||||||
%"50" = inttoptr i64 %"42" to ptr addrspace(1)
|
|
||||||
%"32" = getelementptr inbounds i8, ptr addrspace(1) %"50", i64 8
|
|
||||||
%"30" = load <2 x i32>, ptr addrspace(1) %"32", align 8
|
|
||||||
%"43" = extractelement <2 x i32> %"30", i8 0
|
|
||||||
%"44" = extractelement <2 x i32> %"30", i8 1
|
|
||||||
store i32 %"43", ptr addrspace(5) %"38", align 4
|
|
||||||
store i32 %"44", ptr addrspace(5) %"39", align 4
|
|
||||||
%"46" = load i32, ptr addrspace(5) %"38", align 4
|
|
||||||
%"47" = load i32, ptr addrspace(5) %"39", align 4
|
|
||||||
%"45" = add i32 %"46", %"47"
|
|
||||||
store i32 %"45", ptr addrspace(5) %"38", align 4
|
|
||||||
%"48" = load i64, ptr addrspace(5) %"37", align 4
|
|
||||||
%"49" = load i32, ptr addrspace(5) %"38", align 4
|
|
||||||
%"51" = inttoptr i64 %"48" to ptr addrspace(1)
|
|
||||||
store i32 %"49", ptr addrspace(1) %"51", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
@ -1,30 +0,0 @@
|
|||||||
define amdgpu_kernel void @not(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
|
|
||||||
%"33" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"34" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"35" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"36" = alloca i64, align 8, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"30"
|
|
||||||
|
|
||||||
"30": ; preds = %1
|
|
||||||
%"37" = load i64, ptr addrspace(4) %"31", align 4
|
|
||||||
store i64 %"37", ptr addrspace(5) %"33", align 4
|
|
||||||
%"38" = load i64, ptr addrspace(4) %"32", align 4
|
|
||||||
store i64 %"38", ptr addrspace(5) %"34", align 4
|
|
||||||
%"40" = load i64, ptr addrspace(5) %"33", align 4
|
|
||||||
%"45" = inttoptr i64 %"40" to ptr
|
|
||||||
%"39" = load i64, ptr %"45", align 4
|
|
||||||
store i64 %"39", ptr addrspace(5) %"35", align 4
|
|
||||||
%"42" = load i64, ptr addrspace(5) %"35", align 4
|
|
||||||
%"46" = xor i64 %"42", -1
|
|
||||||
store i64 %"46", ptr addrspace(5) %"36", align 4
|
|
||||||
%"43" = load i64, ptr addrspace(5) %"34", align 4
|
|
||||||
%"44" = load i64, ptr addrspace(5) %"36", align 4
|
|
||||||
%"48" = inttoptr i64 %"43" to ptr
|
|
||||||
store i64 %"44", ptr %"48", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
@ -1,39 +0,0 @@
|
|||||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
|
||||||
|
|
||||||
define amdgpu_kernel void @ntid(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #1 {
|
|
||||||
%"37" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"38" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"39" = alloca i32, align 4, addrspace(5)
|
|
||||||
%"40" = alloca i32, align 4, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"32"
|
|
||||||
|
|
||||||
"32": ; preds = %1
|
|
||||||
%"41" = load i64, ptr addrspace(4) %"35", align 4
|
|
||||||
store i64 %"41", ptr addrspace(5) %"37", align 4
|
|
||||||
%"42" = load i64, ptr addrspace(4) %"36", align 4
|
|
||||||
store i64 %"42", ptr addrspace(5) %"38", align 4
|
|
||||||
%"44" = load i64, ptr addrspace(5) %"37", align 4
|
|
||||||
%"51" = inttoptr i64 %"44" to ptr
|
|
||||||
%"43" = load i32, ptr %"51", align 4
|
|
||||||
store i32 %"43", ptr addrspace(5) %"39", align 4
|
|
||||||
%"31" = call i32 @__zluda_ptx_impl_sreg_ntid(i8 0)
|
|
||||||
br label %"33"
|
|
||||||
|
|
||||||
"33": ; preds = %"32"
|
|
||||||
store i32 %"31", ptr addrspace(5) %"40", align 4
|
|
||||||
%"47" = load i32, ptr addrspace(5) %"39", align 4
|
|
||||||
%"48" = load i32, ptr addrspace(5) %"40", align 4
|
|
||||||
%"46" = add i32 %"47", %"48"
|
|
||||||
store i32 %"46", ptr addrspace(5) %"39", align 4
|
|
||||||
%"49" = load i64, ptr addrspace(5) %"38", align 4
|
|
||||||
%"50" = load i32, ptr addrspace(5) %"39", align 4
|
|
||||||
%"52" = inttoptr i64 %"49" to ptr
|
|
||||||
store i32 %"50", ptr %"52", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
||||||
attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
@ -1,36 +0,0 @@
|
|||||||
define amdgpu_kernel void @or(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 {
|
|
||||||
%"35" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"36" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"37" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"38" = alloca i64, align 8, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"32"
|
|
||||||
|
|
||||||
"32": ; preds = %1
|
|
||||||
%"39" = load i64, ptr addrspace(4) %"33", align 4
|
|
||||||
store i64 %"39", ptr addrspace(5) %"35", align 4
|
|
||||||
%"40" = load i64, ptr addrspace(4) %"34", align 4
|
|
||||||
store i64 %"40", ptr addrspace(5) %"36", align 4
|
|
||||||
%"42" = load i64, ptr addrspace(5) %"35", align 4
|
|
||||||
%"50" = inttoptr i64 %"42" to ptr
|
|
||||||
%"41" = load i64, ptr %"50", align 4
|
|
||||||
store i64 %"41", ptr addrspace(5) %"37", align 4
|
|
||||||
%"43" = load i64, ptr addrspace(5) %"35", align 4
|
|
||||||
%"51" = inttoptr i64 %"43" to ptr
|
|
||||||
%"31" = getelementptr inbounds i8, ptr %"51", i64 8
|
|
||||||
%"44" = load i64, ptr %"31", align 4
|
|
||||||
store i64 %"44", ptr addrspace(5) %"38", align 4
|
|
||||||
%"46" = load i64, ptr addrspace(5) %"37", align 4
|
|
||||||
%"47" = load i64, ptr addrspace(5) %"38", align 4
|
|
||||||
%"52" = or i64 %"46", %"47"
|
|
||||||
store i64 %"52", ptr addrspace(5) %"37", align 4
|
|
||||||
%"48" = load i64, ptr addrspace(5) %"36", align 4
|
|
||||||
%"49" = load i64, ptr addrspace(5) %"37", align 4
|
|
||||||
%"55" = inttoptr i64 %"48" to ptr
|
|
||||||
store i64 %"49", ptr %"55", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
@ -1,33 +0,0 @@
|
|||||||
define amdgpu_kernel void @popc(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
|
|
||||||
%"32" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"33" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"34" = alloca i32, align 4, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"29"
|
|
||||||
|
|
||||||
"29": ; preds = %1
|
|
||||||
%"35" = load i64, ptr addrspace(4) %"30", align 4
|
|
||||||
store i64 %"35", ptr addrspace(5) %"32", align 4
|
|
||||||
%"36" = load i64, ptr addrspace(4) %"31", align 4
|
|
||||||
store i64 %"36", ptr addrspace(5) %"33", align 4
|
|
||||||
%"38" = load i64, ptr addrspace(5) %"32", align 4
|
|
||||||
%"43" = inttoptr i64 %"38" to ptr
|
|
||||||
%"37" = load i32, ptr %"43", align 4
|
|
||||||
store i32 %"37", ptr addrspace(5) %"34", align 4
|
|
||||||
%"40" = load i32, ptr addrspace(5) %"34", align 4
|
|
||||||
%"44" = call i32 @llvm.ctpop.i32(i32 %"40")
|
|
||||||
store i32 %"44", ptr addrspace(5) %"34", align 4
|
|
||||||
%"41" = load i64, ptr addrspace(5) %"33", align 4
|
|
||||||
%"42" = load i32, ptr addrspace(5) %"34", align 4
|
|
||||||
%"45" = inttoptr i64 %"41" to ptr
|
|
||||||
store i32 %"42", ptr %"45", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
|
||||||
declare i32 @llvm.ctpop.i32(i32) #1
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
||||||
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
|
@ -1,57 +0,0 @@
|
|||||||
define amdgpu_kernel void @pred_not(ptr addrspace(4) byref(i64) %"41", ptr addrspace(4) byref(i64) %"42") #0 {
|
|
||||||
%"43" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"44" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"45" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"46" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"47" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"48" = alloca i1, align 1, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"40"
|
|
||||||
|
|
||||||
"40": ; preds = %1
|
|
||||||
%"49" = load i64, ptr addrspace(4) %"41", align 4
|
|
||||||
store i64 %"49", ptr addrspace(5) %"43", align 4
|
|
||||||
%"50" = load i64, ptr addrspace(4) %"42", align 4
|
|
||||||
store i64 %"50", ptr addrspace(5) %"44", align 4
|
|
||||||
%"52" = load i64, ptr addrspace(5) %"43", align 4
|
|
||||||
%"66" = inttoptr i64 %"52" to ptr
|
|
||||||
%"51" = load i64, ptr %"66", align 4
|
|
||||||
store i64 %"51", ptr addrspace(5) %"45", align 4
|
|
||||||
%"53" = load i64, ptr addrspace(5) %"43", align 4
|
|
||||||
%"67" = inttoptr i64 %"53" to ptr
|
|
||||||
%"37" = getelementptr inbounds i8, ptr %"67", i64 8
|
|
||||||
%"54" = load i64, ptr %"37", align 4
|
|
||||||
store i64 %"54", ptr addrspace(5) %"46", align 4
|
|
||||||
%"56" = load i64, ptr addrspace(5) %"45", align 4
|
|
||||||
%"57" = load i64, ptr addrspace(5) %"46", align 4
|
|
||||||
%"55" = icmp ult i64 %"56", %"57"
|
|
||||||
store i1 %"55", ptr addrspace(5) %"48", align 1
|
|
||||||
%"59" = load i1, ptr addrspace(5) %"48", align 1
|
|
||||||
%"58" = xor i1 %"59", true
|
|
||||||
store i1 %"58", ptr addrspace(5) %"48", align 1
|
|
||||||
%"60" = load i1, ptr addrspace(5) %"48", align 1
|
|
||||||
br i1 %"60", label %"16", label %"17"
|
|
||||||
|
|
||||||
"16": ; preds = %"40"
|
|
||||||
store i64 1, ptr addrspace(5) %"47", align 4
|
|
||||||
br label %"17"
|
|
||||||
|
|
||||||
"17": ; preds = %"16", %"40"
|
|
||||||
%"62" = load i1, ptr addrspace(5) %"48", align 1
|
|
||||||
br i1 %"62", label %"19", label %"18"
|
|
||||||
|
|
||||||
"18": ; preds = %"17"
|
|
||||||
store i64 2, ptr addrspace(5) %"47", align 4
|
|
||||||
br label %"19"
|
|
||||||
|
|
||||||
"19": ; preds = %"18", %"17"
|
|
||||||
%"64" = load i64, ptr addrspace(5) %"44", align 4
|
|
||||||
%"65" = load i64, ptr addrspace(5) %"47", align 4
|
|
||||||
%"68" = inttoptr i64 %"64" to ptr
|
|
||||||
store i64 %"65", ptr %"68", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
@ -1,38 +0,0 @@
|
|||||||
define amdgpu_kernel void @prmt(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 {
|
|
||||||
%"35" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"36" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"37" = alloca i32, align 4, addrspace(5)
|
|
||||||
%"38" = alloca i32, align 4, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"32"
|
|
||||||
|
|
||||||
"32": ; preds = %1
|
|
||||||
%"39" = load i64, ptr addrspace(4) %"33", align 4
|
|
||||||
store i64 %"39", ptr addrspace(5) %"35", align 4
|
|
||||||
%"40" = load i64, ptr addrspace(4) %"34", align 4
|
|
||||||
store i64 %"40", ptr addrspace(5) %"36", align 4
|
|
||||||
%"42" = load i64, ptr addrspace(5) %"35", align 4
|
|
||||||
%"50" = inttoptr i64 %"42" to ptr
|
|
||||||
%"41" = load i32, ptr %"50", align 4
|
|
||||||
store i32 %"41", ptr addrspace(5) %"37", align 4
|
|
||||||
%"43" = load i64, ptr addrspace(5) %"35", align 4
|
|
||||||
%"51" = inttoptr i64 %"43" to ptr
|
|
||||||
%"31" = getelementptr inbounds i8, ptr %"51", i64 4
|
|
||||||
%"44" = load i32, ptr %"31", align 4
|
|
||||||
store i32 %"44", ptr addrspace(5) %"38", align 4
|
|
||||||
%"46" = load i32, ptr addrspace(5) %"37", align 4
|
|
||||||
%"47" = load i32, ptr addrspace(5) %"38", align 4
|
|
||||||
%2 = bitcast i32 %"46" to <4 x i8>
|
|
||||||
%3 = bitcast i32 %"47" to <4 x i8>
|
|
||||||
%"52" = shufflevector <4 x i8> %2, <4 x i8> %3, <4 x i32> <i32 4, i32 0, i32 6, i32 7>
|
|
||||||
store <4 x i8> %"52", ptr addrspace(5) %"38", align 4
|
|
||||||
%"48" = load i64, ptr addrspace(5) %"36", align 4
|
|
||||||
%"49" = load i32, ptr addrspace(5) %"38", align 4
|
|
||||||
%"55" = inttoptr i64 %"48" to ptr
|
|
||||||
store i32 %"49", ptr %"55", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
@ -1,33 +0,0 @@
|
|||||||
define amdgpu_kernel void @rcp(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
|
|
||||||
%"32" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"33" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"34" = alloca float, align 4, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"29"
|
|
||||||
|
|
||||||
"29": ; preds = %1
|
|
||||||
%"35" = load i64, ptr addrspace(4) %"30", align 4
|
|
||||||
store i64 %"35", ptr addrspace(5) %"32", align 4
|
|
||||||
%"36" = load i64, ptr addrspace(4) %"31", align 4
|
|
||||||
store i64 %"36", ptr addrspace(5) %"33", align 4
|
|
||||||
%"38" = load i64, ptr addrspace(5) %"32", align 4
|
|
||||||
%"43" = inttoptr i64 %"38" to ptr
|
|
||||||
%"37" = load float, ptr %"43", align 4
|
|
||||||
store float %"37", ptr addrspace(5) %"34", align 4
|
|
||||||
%"40" = load float, ptr addrspace(5) %"34", align 4
|
|
||||||
%"39" = call float @llvm.amdgcn.rcp.f32(float %"40")
|
|
||||||
store float %"39", ptr addrspace(5) %"34", align 4
|
|
||||||
%"41" = load i64, ptr addrspace(5) %"33", align 4
|
|
||||||
%"42" = load float, ptr addrspace(5) %"34", align 4
|
|
||||||
%"44" = inttoptr i64 %"41" to ptr
|
|
||||||
store float %"42", ptr %"44", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
|
||||||
declare float @llvm.amdgcn.rcp.f32(float) #1
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
||||||
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
|
@ -1,36 +0,0 @@
|
|||||||
define amdgpu_kernel void @reg_local(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 {
|
|
||||||
%"10" = alloca [8 x i8], align 8, addrspace(5)
|
|
||||||
%"39" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"40" = alloca i64, align 8, addrspace(5)
|
|
||||||
%"41" = alloca i64, align 8, addrspace(5)
|
|
||||||
br label %1
|
|
||||||
|
|
||||||
1: ; preds = %0
|
|
||||||
br label %"36"
|
|
||||||
|
|
||||||
"36": ; preds = %1
|
|
||||||
%"42" = load i64, ptr addrspace(4) %"37", align 4
|
|
||||||
store i64 %"42", ptr addrspace(5) %"39", align 4
|
|
||||||
%"43" = load i64, ptr addrspace(4) %"38", align 4
|
|
||||||
store i64 %"43", ptr addrspace(5) %"40", align 4
|
|
||||||
%"45" = load i64, ptr addrspace(5) %"39", align 4
|
|
||||||
%"51" = inttoptr i64 %"45" to ptr addrspace(1)
|
|
||||||
%"50" = load i64, ptr addrspace(1) %"51", align 4
|
|
||||||
store i64 %"50", ptr addrspace(5) %"41", align 4
|
|
||||||
%"46" = load i64, ptr addrspace(5) %"41", align 4
|
|
||||||
%"31" = add i64 %"46", 1
|
|
||||||
%"52" = addrspacecast ptr addrspace(5) %"10" to ptr
|
|
||||||
store i64 %"31", ptr %"52", align 4
|
|
||||||
%"54" = addrspacecast ptr addrspace(5) %"10" to ptr
|
|
||||||
%"33" = getelementptr inbounds i8, ptr %"54", i64 0
|
|
||||||
%"55" = load i64, ptr %"33", align 4
|
|
||||||
store i64 %"55", ptr addrspace(5) %"41", align 4
|
|
||||||
%"48" = load i64, ptr addrspace(5) %"40", align 4
|
|
||||||
%"56" = inttoptr i64 %"48" to ptr addrspace(1)
|
|
||||||
%"35" = getelementptr inbounds i8, ptr addrspace(1) %"56", i64 0
|
|
||||||
%"49" = load i64, ptr addrspace(5) %"41", align 4
|
|
||||||
store i64 %"49", ptr addrspace(1) %"35", align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user