Compare commits


10 Commits
v4 ... master

Author SHA1 Message Date
Andrzej Janik
adc4673a20 Explicitly fail compilation on ROCm 6.4 (#361)
AMD broke comgr ABI in 6.4. This is a temporary solution.
2025-04-20 17:02:05 +02:00
Joëlle van Essen
7cdab7abc2 Implement mul24 (#351) 2025-04-08 12:27:19 +02:00
Andrzej Janik
d704e92c97 Support instruction modes (denormal and rounding) on AMD GPUs (#342) 2025-03-17 21:37:26 +01:00
Joëlle van Essen
867e4728d5 LLVM unit tests (#324)
* LLVM unit tests: add assembly files

* LLVM unit tests: first attempt

* LLVM unit tests: fix - parse bitcode in context

* LLVM unit tests: use pretty_assertions for line-by-line diff

* LLVM unit tests: Write IR to file for failed test

* LLVM unit tests: just use the stack

* LLVM unit tests: use MaybeUninit

* LLVM unit tests: add mul24.ll

* LLVM unit tests: Adjustments after review

* LLVM unit tests: Include emit_llvm::Context in emit_llvm::Module

* LLVM unit tests: Fix typo

* LLVM unit tests: Context need not be pub
2025-02-19 21:21:20 +01:00
Andrzej Janik
646d746e02 Start working on mul24 2025-02-07 19:37:11 +00:00
Andrzej Janik
df5a96d935 Improve build system (#329)
Also fix Dockerfile and Windows build
2025-01-28 01:55:36 +01:00
Alexander Zaitsev
9c0747a5f7 fix: missing inherits in a release-lto profile (#319) 2025-01-03 16:58:19 +01:00
Alexander Zaitsev
fee20e54d9 feat: enable LTO and codegen-units = 1 optimization (#318) 2025-01-02 19:07:39 +01:00
Joëlle van Essen
7399132d5d Fix test in zluda_dump (#316) 2025-01-01 23:02:59 +01:00
Andrzej Janik
ecd61a8e2a Update README for version 4 (#315) 2024-12-31 17:33:59 +01:00
149 changed files with 9859 additions and 796 deletions

View File

@ -0,0 +1,2 @@
[alias]
xtask = "run --package xtask --"

View File

@ -32,7 +32,7 @@ RUN DEBIAN_FRONTEND=noninteractive apt-get update -y && DEBIAN_FRONTEND=noninter
     cuda-profiler-api-${CUDA_PKG_VERSION} \
     cuda-nvcc-${CUDA_PKG_VERSION}
-ARG ROCM_VERSION=6.2.2
+ARG ROCM_VERSION=6.3.1
 RUN mkdir --parents --mode=0755 /etc/apt/keyrings && \
     wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | \
     gpg --dearmor | tee /etc/apt/keyrings/rocm.gpg > /dev/null && \
@ -43,7 +43,7 @@ RUN mkdir --parents --mode=0755 /etc/apt/keyrings && \
     rocm-gdb \
     rocm-smi-lib \
     rocm-llvm-dev \
-    hip-runtime-amd && \
+    hip-runtime-amd \
+    hip-dev && \
     echo '/opt/rocm/lib' > /etc/ld.so.conf.d/rocm.conf && \
     ldconfig

View File

@ -1,61 +0,0 @@
# Dependencies
Development builds of ZLUDA require the following dependencies:
* CMake
* Python 3
Additionally, the repository has to be cloned with Git submodules initialized. If you cloned the repo without initializing submodules, do this:
```
git submodule update --init --recursive
```
# Tests
Tests should be executed with the `--workspace` option to test non-default targets:
```
cargo test --workspace
```
# Debugging
## Debugging CUDA applications
When running an application with ZLUDA you will quite often run into subtle bugs or incompatibilities in the generated GPU code. The best way to debug an application's GPU CUDA code is to use the ZLUDA dumper.
The `zluda_dump` library can be injected into a CUDA application and produce a trace which, for every launched GPU function, contains:
* PTX source
* Launch arguments (block size, grid size, shared memory size)
* A dump of function arguments, both before and after execution
Example use with GeekBench:
```
set ZLUDA_DUMP_KERNEL=knn_match
set ZLUDA_DUMP_DIR=C:\temp\zluda_dump
"<ZLUDA_PATH>\zluda_with.exe" "<ZLUDA_PATH>\zluda_dump.dll" -- "geekbench_x86_64.exe" --compute CUDA
```
The example above will, for every execution of the GPU function `knn_match`, save its details into the directory `C:\temp\zluda_dump`.
This dump can be replayed with the `replay.py` script from the `zluda_dump` source directory. Use it like this:
```
python replay.py "C:\temp\zluda_dump\geekbench_x86_64.exe"
```
You must copy (or symlink) the ZLUDA `nvcuda.dll` into the PyCUDA directory so that it runs using ZLUDA. Example output:
```
Intel(R) Graphics [0x3e92] [github.com/vosen/ZLUDA]
C:\temp\zluda_dump\geekbench_x86_64.exe\4140_scale_pyramid
C:\temp\zluda_dump\geekbench_x86_64.exe\4345_convolve_1d_vertical_grayscale
Skipping, launch block size (512) bigger than maximum block size (256)
C:\temp\zluda_dump\geekbench_x86_64.exe\4480_scale_pyramid
6:
Arrays are not equal
Mismatched elements: 1200 / 19989588 (0.006%)
Max absolute difference: 255
Max relative difference: 255.
x: array([ 7, 6, 8, ..., 193, 195, 193], dtype=uint8)
y: array([ 7, 6, 8, ..., 193, 195, 193], dtype=uint8)
```
From this output one can observe that in kernel launch 4480, the 6th argument to the function `scale_pyramid` differs between what was executed on an NVIDIA GPU using CUDA and on an Intel GPU using ZLUDA.
__Important__: It's impossible to infer the type (and semantics) of an argument passed to a GPU function. At our level it's just a buffer of bytes, and by default `replay.py` simply checks whether two buffers are byte-equal. That means you will get a ton of spurious mismatches when running `replay.py`. You should override the comparisons for your particular case in `replay.py` - it already contains some overrides for GeekBench kernels.

Cargo.lock (generated file, 1395 lines added)

File diff suppressed because it is too large

View File

@ -18,7 +18,16 @@ members = [
"ptx_parser", "ptx_parser",
"ptx_parser_macros", "ptx_parser_macros",
"ptx_parser_macros_impl", "ptx_parser_macros_impl",
"xtask",
"zluda_bindgen", "zluda_bindgen",
] ]
default-members = ["zluda", "zluda_ml", "zluda_inject", "zluda_redirect"] default-members = ["zluda", "zluda_ml", "zluda_inject", "zluda_redirect"]
[profile.release-lto]
inherits = "release"
codegen-units = 1
lto = true
[profile.dev.package.xtask]
opt-level = 2
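Presumably the new `release-lto` profile is opt-in and selected via Cargo's standard flag, e.g. `cargo build --profile release-lto`; whether the `xtask` wrapper forwards it is not shown in this diff.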

File diff suppressed because one or more lines are too long

Before: image, 259 KiB

View File

@ -4,18 +4,23 @@
 ZLUDA is a drop-in replacement for CUDA on non-NVIDIA GPU. ZLUDA allows to run unmodified CUDA applications using non-NVIDIA GPUs with near-native performance.
+ZLUDA supports AMD Radeon RX 5000 series and newer GPUs (both desktop and integrated).
+![GeekBench 5.5.1 chart](geekbench.svg)
 ZLUDA is work in progress. Follow development here and say hi on [Discord](https://discord.gg/sg6BNzXuc7). For more details see the announcement: https://vosen.github.io/ZLUDA/blog/zludas-third-life/
 ## Usage
-**Warning**: ZLUDA is under heavy development (see news [here](https://vosen.github.io/ZLUDA/blog/zludas-third-life/)). Instructions below might not work.
+**Warning**: This version of ZLUDA is under heavy development (more [here](https://vosen.github.io/ZLUDA/blog/zludas-third-life/)) and right now only supports Geekbench. ZLUDA probably will not work with your application just yet.
 ### Windows
-You should have the most recent ROCm installed.\
-Run your application like this:
-```
-<ZLUDA_DIRECTORY>\zluda_with.exe -- <APPLICATION> <APPLICATIONS_ARGUMENTS>
-```
+You should have a recent AMD GPU driver ("AMD Software: Adrenalin Edition") installed.\
+To run your application you should either:
+* (Recommended approach) Copy the ZLUDA-provided `nvcuda.dll` and `nvml.dll` from `target\release` (if built from sources) or `zluda` (if you downloaded a zip package) into a path which your application uses to load CUDA. Paths vary from application to application, but usually it's the directory where the .exe file is located
+* Use the ZLUDA launcher like below. The ZLUDA launcher is known to be buggy and incomplete:
+```
+<ZLUDA_DIRECTORY>\zluda_with.exe -- <APPLICATION> <APPLICATIONS_ARGUMENTS>
+```
 ### Linux
@ -24,33 +29,44 @@ Run your application like this:
 LD_LIBRARY_PATH=<ZLUDA_DIRECTORY> <APPLICATION> <APPLICATIONS_ARGUMENTS>
 ```
+where `<ZLUDA_DIRECTORY>` is the directory which contains the ZLUDA-provided `libcuda.so`: `target/release` if you built from sources or `zluda` if you downloaded the prebuilt package.
 ### MacOS
 Not supported
 ## Building
-_Note_: This repo has submodules. Make sure to recurse submodules when cloning this repo, e.g.: `git clone --recursive https://github.com/vosen/ZLUDA.git`
-You should have a relatively recent version of Rust installed, then you just do:
-```
-cargo build --release
-```
-in the main directory of the project.
-### Linux
-If you are building on Linux you must also symlink (or rename) the ZLUDA output binaries after ZLUDA build finishes:
-```
-ln -s libnvcuda.so target/release/libcuda.so
-ln -s libnvcuda.so target/release/libcuda.so.1
-ln -s libnvml.so target/release/libnvidia-ml.so
-```
+**Warning**: ZLUDA is under heavy development (see news [here](https://vosen.github.io/ZLUDA/blog/zludas-third-life/)). Instructions below might not work.
+### Dependencies
+* Git
+* CMake
+* Python 3
+* Rust compiler (recent version)
+* C++ compiler
+* (Optional, but recommended) [Ninja build system](https://ninja-build.org/)
+### Build steps
+* Git clone the repo (make sure to use the `--recursive` option to fetch submodules):
+  `git clone --recursive https://github.com/vosen/ZLUDA.git`
+* Enter the freshly cloned `ZLUDA` directory and build with cargo (this takes a while):
+  `cargo xtask --release`
 ## Contributing
-If you want to develop ZLUDA itself, read [CONTRIBUTING.md](CONTRIBUTING.md), it contains instructions how to set up dependencies and run tests
+The ZLUDA project has commercial backing and _does not_ accept donations.
+The ZLUDA project accepts pull requests and other non-monetary contributions.
+If you want to contribute a code fix or documentation update, feel free to open a Pull Request.
+### Getting started
+There's no architecture document (yet). The two most important crates in ZLUDA are `ptx` (the PTX compiler) and `zluda` (the AMD GPU runtime). A good starting point for tinkering with the project is to run one of the `ptx` unit tests under a debugger and understand what it is doing. `cargo test -p ptx -- ::add_hip` is a simple test that adds two numbers.
+GitHub issues tagged with ["help wanted"](https://github.com/vosen/ZLUDA/issues?q=is%3Aissue+is%3Aopen+label%3A%22help+wanted%22) are tasks that are self-contained. Their level of difficulty varies; they are not always good beginner tasks, but they are defined unambiguously.
+If you have questions, feel free to ask on the [#devtalk channel on Discord](https://discord.com/channels/1273316903783497778/1303329281409159270).
 ## License

View File

@ -133,21 +133,26 @@ pub fn compile_bitcode(
         &linking_info,
         amd_comgr_action_kind_t::AMD_COMGR_ACTION_LINK_BC_TO_BC,
     )?;
-    let link_with_device_libs_info = ActionInfo::new()?;
-    link_with_device_libs_info.set_isa_name(gcn_arch)?;
-    link_with_device_libs_info.set_language(amd_comgr_language_t::AMD_COMGR_LANGUAGE_LLVM_IR)?;
-    // This makes no sense, but it makes ockl linking work
-    link_with_device_libs_info
-        .set_options([c"-Xclang", c"-mno-link-builtin-bitcode-postopt"].into_iter())?;
-    let with_device_libs = do_action(
-        &linked_data_set,
-        &link_with_device_libs_info,
-        amd_comgr_action_kind_t::AMD_COMGR_ACTION_COMPILE_SOURCE_WITH_DEVICE_LIBS_TO_BC,
-    )?;
-    let compile_action_info = ActionInfo::new()?;
-    compile_action_info.set_isa_name(gcn_arch)?;
-    let common_options = [c"-O3", c"-mno-wavefrontsize64", c"-mcumode"].into_iter();
+    let compile_to_exec = ActionInfo::new()?;
+    compile_to_exec.set_isa_name(gcn_arch)?;
+    compile_to_exec.set_language(amd_comgr_language_t::AMD_COMGR_LANGUAGE_LLVM_IR)?;
+    let common_options = [
+        // This makes no sense, but it makes ockl linking work
+        c"-Xclang",
+        c"-mno-link-builtin-bitcode-postopt",
+        // Otherwise LLVM omits dynamic fp mode for ockl functions during linking
+        // and then fails to inline them
+        c"-Xclang",
+        c"-fdenormal-fp-math=dynamic",
+        c"-O3",
+        c"-mno-wavefrontsize64",
+        c"-mcumode",
+        // Useful for inlining reports, combined with AMD_COMGR_SAVE_TEMPS=1 AMD_COMGR_EMIT_VERBOSE_LOGS=1 AMD_COMGR_REDIRECT_LOGS=stderr
+        // c"-fsave-optimization-record=yaml",
+    ]
+    .into_iter();
     let opt_options = if cfg!(debug_assertions) {
+        //[c"-g", c"-mllvm", c"-print-before-all", c"", c""]
         [c"-g", c"", c"", c"", c""]
     } else {
         [
@ -159,19 +164,14 @@ pub fn compile_bitcode(
             c"-inlinehint-threshold=3250",
         ]
     };
-    compile_action_info.set_options(common_options.chain(opt_options))?;
-    let reloc_data_set = do_action(
-        &with_device_libs,
-        &compile_action_info,
-        amd_comgr_action_kind_t::AMD_COMGR_ACTION_CODEGEN_BC_TO_RELOCATABLE,
-    )?;
+    compile_to_exec.set_options(common_options.chain(opt_options))?;
     let exec_data_set = do_action(
-        &reloc_data_set,
-        &compile_action_info,
-        amd_comgr_action_kind_t::AMD_COMGR_ACTION_LINK_RELOCATABLE_TO_EXECUTABLE,
+        &linked_data_set,
+        &compile_to_exec,
+        amd_comgr_action_kind_t::AMD_COMGR_ACTION_COMPILE_SOURCE_TO_EXECUTABLE,
     )?;
     let executable =
         exec_data_set.get_data(amd_comgr_data_kind_t::AMD_COMGR_DATA_KIND_EXECUTABLE, 0)?;
     executable.copy_content()
 }

View File

@ -13,7 +13,7 @@ fn main() -> Result<(), VarError> {
println!("cargo:rustc-link-search=native=C:\\Windows\\System32"); println!("cargo:rustc-link-search=native=C:\\Windows\\System32");
}; };
} else { } else {
println!("cargo:rustc-link-lib=dylib=amd_comgr"); println!("cargo:rustc-link-lib=dylib:+verbatim=libamd_comgr.so.2");
println!("cargo:rustc-link-search=native=/opt/rocm/lib/"); println!("cargo:rustc-link-search=native=/opt/rocm/lib/");
} }
Ok(()) Ok(())

View File

@ -13,7 +13,7 @@ fn main() -> Result<(), VarError> {
println!("cargo:rustc-link-search=native=C:\\Windows\\System32"); println!("cargo:rustc-link-search=native=C:\\Windows\\System32");
}; };
} else { } else {
println!("cargo:rustc-link-lib=dylib=amdhip64"); println!("cargo:rustc-link-lib=dylib:+verbatim=libamdhip64.so.6");
println!("cargo:rustc-link-search=native=/opt/rocm/lib/"); println!("cargo:rustc-link-search=native=/opt/rocm/lib/");
} }
Ok(()) Ok(())

geekbench.svg (new file, 1 line added)

File diff suppressed because one or more lines are too long

After: image, 287 KiB

View File

@ -17,6 +17,10 @@ bitflags = "1.2"
 rustc-hash = "2.0.0"
 strum = "0.26"
 strum_macros = "0.26"
+petgraph = "0.7.1"
+microlp = "0.2.10"
+int-enum = "1.1"
+unwrap_or = "1.0.1"

 [dev-dependencies]
 hip_runtime-sys = { path = "../ext/hip_runtime-sys" }
@ -24,3 +28,4 @@ comgr = { path = "../comgr" }
 tempfile = "3"
 paste = "1.0"
 cuda-driver-sys = "0.3.0"
+pretty_assertions = "1.4.1"

View File

@ -2,8 +2,8 @@ use super::*;
 pub(super) fn run<'a, 'input>(
     resolver: &mut GlobalStringIdentResolver2<'input>,
-    directives: Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>,
-) -> Result<Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
+    directives: Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>,
+) -> Result<Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
     directives
         .into_iter()
         .map(|directive| run_directive(resolver, directive))
@ -12,8 +12,8 @@ pub(super) fn run<'a, 'input>(
 fn run_directive<'input>(
     resolver: &mut GlobalStringIdentResolver2,
-    directive: Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>,
-) -> Result<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
+    directive: Directive2<ast::Instruction<SpirvWord>, SpirvWord>,
+) -> Result<Directive2<ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
     Ok(match directive {
         var @ Directive2::Variable(..) => var,
         Directive2::Method(method) => Directive2::Method(run_method(resolver, method)?),
@ -22,13 +22,13 @@ fn run_directive<'input>(
 fn run_method<'input>(
     resolver: &mut GlobalStringIdentResolver2,
-    mut method: Function2<'input, ast::Instruction<SpirvWord>, SpirvWord>,
-) -> Result<Function2<'input, ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
+    mut method: Function2<ast::Instruction<SpirvWord>, SpirvWord>,
+) -> Result<Function2<ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
     let is_declaration = method.body.is_none();
     let mut body = Vec::new();
     let mut remap_returns = Vec::new();
-    if !method.func_decl.name.is_kernel() {
-        for arg in method.func_decl.return_arguments.iter_mut() {
+    if !method.is_kernel {
+        for arg in method.return_arguments.iter_mut() {
             match arg.state_space {
                 ptx_parser::StateSpace::Param => {
                     arg.state_space = ptx_parser::StateSpace::Reg;
@ -51,7 +51,7 @@ fn run_method<'input>(
                 _ => return Err(error_unreachable()),
             }
         }
-        for arg in method.func_decl.input_arguments.iter_mut() {
+        for arg in method.input_arguments.iter_mut() {
             match arg.state_space {
                 ptx_parser::StateSpace::Param => {
                     arg.state_space = ptx_parser::StateSpace::Reg;
@ -95,14 +95,7 @@ fn run_method<'input>(
             Ok::<_, TranslateError>(body)
         })
         .transpose()?;
-    Ok(Function2 {
-        func_decl: method.func_decl,
-        globals: method.globals,
-        body,
-        import_as: method.import_as,
-        tuning: method.tuning,
-        linkage: method.linkage,
-    })
+    Ok(Function2 { body, ..method })
 }
 fn run_statement<'input>(

View File

@ -36,6 +36,7 @@ use llvm_zluda::bit_writer::LLVMWriteBitcodeToMemoryBuffer;
 use llvm_zluda::{core::*, *};
 use llvm_zluda::{prelude::*, LLVMZludaBuildAtomicRMW};
 use llvm_zluda::{LLVMCallConv, LLVMZludaBuildAlloca};
+use ptx_parser::Mul24Control;
 const LLVM_UNNAMED: &CStr = c"";
 // https://llvm.org/docs/AMDGPUUsage.html#address-spaces
@ -65,17 +66,24 @@ impl Drop for Context {
     }
 }
-struct Module(LLVMModuleRef);
+pub struct Module(LLVMModuleRef, Context);
 impl Module {
-    fn new(ctx: &Context, name: &CStr) -> Self {
-        Self(unsafe { LLVMModuleCreateWithNameInContext(name.as_ptr(), ctx.get()) })
+    fn new(ctx: Context, name: &CStr) -> Self {
+        Self(
+            unsafe { LLVMModuleCreateWithNameInContext(name.as_ptr(), ctx.get()) },
+            ctx,
+        )
     }
     fn get(&self) -> LLVMModuleRef {
         self.0
     }
+    fn context(&self) -> &Context {
+        &self.1
+    }
     fn verify(&self) -> Result<(), Message> {
         let mut err = ptr::null_mut();
         let error = unsafe {
@ -92,10 +100,15 @@ impl Module {
         }
     }
-    fn write_bitcode_to_memory(&self) -> MemoryBuffer {
+    pub fn write_bitcode_to_memory(&self) -> MemoryBuffer {
         let memory_buffer = unsafe { LLVMWriteBitcodeToMemoryBuffer(self.get()) };
         MemoryBuffer(memory_buffer)
     }
+    pub fn print_module_to_string(&self) -> Message {
+        let asm = unsafe { LLVMPrintModuleToString(self.get()) };
+        Message(unsafe { CStr::from_ptr(asm) })
+    }
 }
 impl Drop for Module {
@ -130,7 +143,7 @@ impl Drop for Builder {
     }
 }
-struct Message(&'static CStr);
+pub struct Message(&'static CStr);
 impl Drop for Message {
     fn drop(&mut self) {
@ -146,6 +159,12 @@ impl std::fmt::Debug for Message {
     }
 }
+impl Message {
+    pub fn to_str(&self) -> &str {
+        self.0.to_str().unwrap().trim()
+    }
+}
 pub struct MemoryBuffer(LLVMMemoryBufferRef);
 impl Drop for MemoryBuffer {
@ -168,11 +187,11 @@ impl Deref for MemoryBuffer {
 pub(super) fn run<'input>(
     id_defs: GlobalStringIdentResolver2<'input>,
-    directives: Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>,
-) -> Result<MemoryBuffer, TranslateError> {
+    directives: Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>,
+) -> Result<Module, TranslateError> {
     let context = Context::new();
-    let module = Module::new(&context, LLVM_UNNAMED);
-    let mut emit_ctx = ModuleEmitContext::new(&context, &module, &id_defs);
+    let module = Module::new(context, LLVM_UNNAMED);
+    let mut emit_ctx = ModuleEmitContext::new(&module, &id_defs);
     for directive in directives {
         match directive {
             Directive2::Variable(linking, variable) => emit_ctx.emit_global(linking, variable)?,
@ -182,7 +201,7 @@ pub(super) fn run<'input>(
     if let Err(err) = module.verify() {
         panic!("{:?}", err);
     }
-    Ok(module.write_bitcode_to_memory())
+    Ok(module)
 }
 struct ModuleEmitContext<'a, 'input> {
@ -194,11 +213,8 @@ struct ModuleEmitContext<'a, 'input> {
 }
 impl<'a, 'input> ModuleEmitContext<'a, 'input> {
-    fn new(
-        context: &Context,
-        module: &Module,
-        id_defs: &'a GlobalStringIdentResolver2<'input>,
-    ) -> Self {
+    fn new(module: &Module, id_defs: &'a GlobalStringIdentResolver2<'input>) -> Self {
+        let context = module.context();
         ModuleEmitContext {
             context: context.get(),
             module: module.get(),
@ -218,24 +234,20 @@ impl<'a, 'input> ModuleEmitContext<'a, 'input> {
     fn emit_method(
         &mut self,
-        method: Function2<'input, ast::Instruction<SpirvWord>, SpirvWord>,
+        method: Function2<ast::Instruction<SpirvWord>, SpirvWord>,
     ) -> Result<(), TranslateError> {
-        let func_decl = method.func_decl;
         let name = method
             .import_as
             .as_deref()
-            .or_else(|| match func_decl.name {
-                ast::MethodName::Kernel(name) => Some(name),
-                ast::MethodName::Func(id) => self.id_defs.ident_map[&id].name.as_deref(),
-            })
+            .or_else(|| self.id_defs.ident_map[&method.name].name.as_deref())
             .ok_or_else(|| error_unreachable())?;
         let name = CString::new(name).map_err(|_| error_unreachable())?;
         let mut fn_ = unsafe { LLVMGetNamedFunction(self.module, name.as_ptr()) };
         if fn_ == ptr::null_mut() {
             let fn_type = get_function_type(
                 self.context,
-                func_decl.return_arguments.iter().map(|v| &v.v_type),
-                func_decl
+                method.return_arguments.iter().map(|v| &v.v_type),
+                method
                     .input_arguments
                     .iter()
                     .map(|v| get_input_argument_type(self.context, &v.v_type, v.state_space)),
@ -245,15 +257,28 @@ impl<'a, 'input> ModuleEmitContext<'a, 'input> {
self.emit_fn_attribute(fn_, "uniform-work-group-size", "true"); self.emit_fn_attribute(fn_, "uniform-work-group-size", "true");
self.emit_fn_attribute(fn_, "no-trapping-math", "true"); self.emit_fn_attribute(fn_, "no-trapping-math", "true");
} }
if let ast::MethodName::Func(name) = func_decl.name { if !method.is_kernel {
self.resolver.register(name, fn_); self.resolver.register(method.name, fn_);
self.emit_fn_attribute(fn_, "denormal-fp-math-f32", "dynamic");
self.emit_fn_attribute(fn_, "denormal-fp-math", "dynamic");
} else {
self.emit_fn_attribute(
fn_,
"denormal-fp-math-f32",
llvm_ftz(method.flush_to_zero_f32),
);
self.emit_fn_attribute(
fn_,
"denormal-fp-math",
llvm_ftz(method.flush_to_zero_f16f64),
);
} }
for (i, param) in func_decl.input_arguments.iter().enumerate() { for (i, param) in method.input_arguments.iter().enumerate() {
let value = unsafe { LLVMGetParam(fn_, i as u32) }; let value = unsafe { LLVMGetParam(fn_, i as u32) };
let name = self.resolver.get_or_add(param.name); let name = self.resolver.get_or_add(param.name);
unsafe { LLVMSetValueName2(value, name.as_ptr().cast(), name.len()) }; unsafe { LLVMSetValueName2(value, name.as_ptr().cast(), name.len()) };
self.resolver.register(param.name, value); self.resolver.register(param.name, value);
if func_decl.name.is_kernel() { if method.is_kernel {
let attr_kind = unsafe { let attr_kind = unsafe {
LLVMGetEnumAttributeKindForName(b"byref".as_ptr().cast(), b"byref".len()) LLVMGetEnumAttributeKindForName(b"byref".as_ptr().cast(), b"byref".len())
}; };
@ -267,7 +292,7 @@ impl<'a, 'input> ModuleEmitContext<'a, 'input> {
unsafe { LLVMAddAttributeAtIndex(fn_, i as u32 + 1, attr) }; unsafe { LLVMAddAttributeAtIndex(fn_, i as u32 + 1, attr) };
} }
} }
let call_conv = if func_decl.name.is_kernel() { let call_conv = if method.is_kernel {
Self::kernel_call_convention() Self::kernel_call_convention()
} else { } else {
Self::func_call_convention() Self::func_call_convention()
@ -282,7 +307,7 @@ impl<'a, 'input> ModuleEmitContext<'a, 'input> {
unsafe { LLVMAppendBasicBlockInContext(self.context, fn_, LLVM_UNNAMED.as_ptr()) }; unsafe { LLVMAppendBasicBlockInContext(self.context, fn_, LLVM_UNNAMED.as_ptr()) };
unsafe { LLVMPositionBuilderAtEnd(self.builder.get(), real_bb) }; unsafe { LLVMPositionBuilderAtEnd(self.builder.get(), real_bb) };
let mut method_emitter = MethodEmitContext::new(self, fn_, variables_builder); let mut method_emitter = MethodEmitContext::new(self, fn_, variables_builder);
for var in func_decl.return_arguments { for var in method.return_arguments {
method_emitter.emit_variable(var)?; method_emitter.emit_variable(var)?;
} }
for statement in statements.iter() { for statement in statements.iter() {
@ -290,6 +315,17 @@ impl<'a, 'input> ModuleEmitContext<'a, 'input> {
                 method_emitter.emit_label_initial(*label);
             }
         }
+        let mut statements = statements.into_iter();
+        if let Some(Statement::Label(label)) = statements.next() {
+            method_emitter.emit_label_delayed(label)?;
+        } else {
+            return Err(error_unreachable());
+        }
+        method_emitter.emit_kernel_rounding_prelude(
+            method.is_kernel,
+            method.rounding_mode_f32,
+            method.rounding_mode_f16f64,
+        )?;
         for statement in statements {
             method_emitter.emit_statement(statement)?;
         }
@ -417,6 +453,14 @@ }
     }
 }
+fn llvm_ftz(ftz: bool) -> &'static str {
+    if ftz {
+        "preserve-sign"
+    } else {
+        "ieee"
+    }
+}
 fn get_input_argument_type(
     context: LLVMContextRef,
     v_type: &ast::Type,
@ -473,9 +517,32 @@ impl<'a> MethodEmitContext<'a> {
             Statement::FunctionPointer(_) => todo!(),
             Statement::VectorRead(vector_read) => self.emit_vector_read(vector_read)?,
             Statement::VectorWrite(vector_write) => self.emit_vector_write(vector_write)?,
+            Statement::SetMode(mode_reg) => self.emit_set_mode(mode_reg)?,
         })
     }
+    // This should be a kernel attribute, but sadly AMDGPU LLVM target does
+    // not support attribute for it. So we have to set it as the first
+    // instruction in the body of a kernel
+    fn emit_kernel_rounding_prelude(
+        &mut self,
+        is_kernel: bool,
+        rounding_mode_f32: ast::RoundingMode,
+        rounding_mode_f16f64: ast::RoundingMode,
+    ) -> Result<(), TranslateError> {
+        if is_kernel {
+            if rounding_mode_f32 != ast::RoundingMode::NearestEven
+                || rounding_mode_f16f64 != ast::RoundingMode::NearestEven
+            {
+                self.emit_set_mode(ModeRegister::Rounding {
+                    f32: rounding_mode_f32,
+                    f16f64: rounding_mode_f16f64,
+                })?;
+            }
+        }
+        Ok(())
+    }
     fn emit_variable(&mut self, var: ast::Variable<SpirvWord>) -> Result<(), TranslateError> {
         let alloca = unsafe {
             LLVMZludaBuildAlloca(
@ -528,6 +595,7 @@ impl<'a> MethodEmitContext<'a> {
             ast::Instruction::Add { data, arguments } => self.emit_add(data, arguments),
             ast::Instruction::St { data, arguments } => self.emit_st(data, arguments),
             ast::Instruction::Mul { data, arguments } => self.emit_mul(data, arguments),
+            ast::Instruction::Mul24 { data, arguments } => self.emit_mul24(data, arguments),
             ast::Instruction::Setp { data, arguments } => self.emit_setp(data, arguments),
             ast::Instruction::SetpBool { .. } => todo!(),
             ast::Instruction::Not { data, arguments } => self.emit_not(data, arguments),
@ -1128,7 +1196,7 @@ impl<'a> MethodEmitContext<'a> {
         let cos = self.emit_intrinsic(
             c"llvm.cos.f32",
             Some(arguments.dst),
-            &ast::ScalarType::F32.into(),
+            Some(&ast::ScalarType::F32.into()),
             vec![(self.resolver.value(arguments.src)?, llvm_f32)],
         )?;
         unsafe { LLVMZludaSetFastMathFlags(cos, LLVMZludaFastMathApproxFunc) }
@ -1381,7 +1449,7 @@ impl<'a> MethodEmitContext<'a> {
         let sin = self.emit_intrinsic(
             c"llvm.sin.f32",
             Some(arguments.dst),
-            &ast::ScalarType::F32.into(),
+            Some(&ast::ScalarType::F32.into()),
             vec![(self.resolver.value(arguments.src)?, llvm_f32)],
         )?;
         unsafe { LLVMZludaSetFastMathFlags(sin, LLVMZludaFastMathApproxFunc) }
@ -1392,12 +1460,12 @@ impl<'a> MethodEmitContext<'a> {
         &mut self,
         name: &CStr,
         dst: Option<SpirvWord>,
-        return_type: &ast::Type,
+        return_type: Option<&ast::Type>,
         arguments: Vec<(LLVMValueRef, LLVMTypeRef)>,
     ) -> Result<LLVMValueRef, TranslateError> {
         let fn_type = get_function_type(
             self.context,
-            iter::once(return_type),
+            return_type.into_iter(),
             arguments.iter().map(|(_, type_)| Ok(*type_)),
         )?;
         let mut fn_ = unsafe { LLVMGetNamedFunction(self.module, name.as_ptr()) };
@ -1558,7 +1626,7 @@ impl<'a> MethodEmitContext<'a> {
                 return self.emit_cvt_float_to_int(
                     data.from,
                     data.to,
-                    integer_rounding.unwrap_or(ast::RoundingMode::NearestEven),
+                    integer_rounding,
                     arguments,
                     Some(LLVMBuildFPToSI),
                 )
@ -1616,7 +1684,7 @@ impl<'a> MethodEmitContext<'a> {
         let clamped = self.emit_intrinsic(
             c"llvm.umin",
             None,
-            &from.into(),
+            Some(&from.into()),
             vec![
                 (self.resolver.value(arguments.src)?, from_llvm),
                 (max, from_llvm),
@ -1646,7 +1714,7 @@ impl<'a> MethodEmitContext<'a> {
         let zero_clamped = self.emit_intrinsic(
             unsafe { CStr::from_bytes_with_nul_unchecked(zero_clamp_intrinsic.as_bytes()) },
             None,
-            &from.into(),
+            Some(&from.into()),
             vec![
                 (self.resolver.value(arguments.src)?, from_llvm),
                 (zero, from_llvm),
@ -1665,7 +1733,7 @@ impl<'a> MethodEmitContext<'a> {
         let fully_clamped = self.emit_intrinsic(
             unsafe { CStr::from_bytes_with_nul_unchecked(max_clamp_intrinsic.as_bytes()) },
             None,
-            &from.into(),
+            Some(&from.into()),
             vec![(zero_clamped, from_llvm), (max, from_llvm)],
         )?;
         let resize_fn = if to.layout().size() >= from.layout().size() {
@ -1705,7 +1773,7 @@ impl<'a> MethodEmitContext<'a> {
         let rounded_float = self.emit_intrinsic(
             unsafe { CStr::from_bytes_with_nul_unchecked(intrinsic.as_bytes()) },
             None,
-            &from.into(),
+            Some(&from.into()),
             vec![(
                 self.resolver.value(arguments.src)?,
                 get_scalar_type(self.context, from),
@ -1774,7 +1842,7 @@ impl<'a> MethodEmitContext<'a> {
         self.emit_intrinsic(
             intrinsic,
             Some(arguments.dst),
-            &data.type_.into(),
+            Some(&data.type_.into()),
             vec![(self.resolver.value(arguments.src)?, type_)],
         )?;
         Ok(())
@ -1795,7 +1863,7 @@ impl<'a> MethodEmitContext<'a> {
         self.emit_intrinsic(
             intrinsic,
             Some(arguments.dst),
-            &data.type_.into(),
+            Some(&data.type_.into()),
             vec![(self.resolver.value(arguments.src)?, type_)],
         )?;
         Ok(())
@ -1817,7 +1885,7 @@ impl<'a> MethodEmitContext<'a> {
         self.emit_intrinsic(
             intrinsic,
             Some(arguments.dst),
-            &data.type_.into(),
+            Some(&data.type_.into()),
             vec![(self.resolver.value(arguments.src)?, type_)],
         )?;
         Ok(())
@ -1939,7 +2007,7 @@ impl<'a> MethodEmitContext<'a> {
         self.emit_intrinsic(
             intrinsic,
             Some(arguments.dst),
-            &data.type_.into(),
+            Some(&data.type_.into()),
             vec![(
                 self.resolver.value(arguments.src)?,
                 get_scalar_type(self.context, data.type_),
@ -1956,7 +2024,7 @@ impl<'a> MethodEmitContext<'a> {
         self.emit_intrinsic(
             c"llvm.amdgcn.log.f32",
             Some(arguments.dst),
-            &ast::ScalarType::F32.into(),
+            Some(&ast::ScalarType::F32.into()),
             vec![(
                 self.resolver.value(arguments.src)?,
                 get_scalar_type(self.context, ast::ScalarType::F32.into()),
@ -2011,7 +2079,7 @@ impl<'a> MethodEmitContext<'a> {
         self.emit_intrinsic(
             intrinsic,
             Some(arguments.dst),
-            &type_.into(),
+            Some(&type_.into()),
             vec![(self.resolver.value(arguments.src)?, llvm_type)],
         )?;
         Ok(())
@ -2035,7 +2103,7 @@ impl<'a> MethodEmitContext<'a> {
         self.emit_intrinsic(
             unsafe { CStr::from_bytes_with_nul_unchecked(intrinsic.as_bytes()) },
             Some(arguments.dst),
-            &data.type_().into(),
+            Some(&data.type_().into()),
             vec![
                 (self.resolver.value(arguments.src1)?, llvm_type),
                 (self.resolver.value(arguments.src2)?, llvm_type),
@ -2062,7 +2130,7 @@ impl<'a> MethodEmitContext<'a> {
         self.emit_intrinsic(
             unsafe { CStr::from_bytes_with_nul_unchecked(intrinsic.as_bytes()) },
             Some(arguments.dst),
-            &data.type_().into(),
+            Some(&data.type_().into()),
             vec![
                 (self.resolver.value(arguments.src1)?, llvm_type),
                 (self.resolver.value(arguments.src2)?, llvm_type),
@ -2080,7 +2148,7 @@ impl<'a> MethodEmitContext<'a> {
         self.emit_intrinsic(
             unsafe { CStr::from_bytes_with_nul_unchecked(intrinsic.as_bytes()) },
             Some(arguments.dst),
-            &data.type_.into(),
+            Some(&data.type_.into()),
             vec![
                 (
                     self.resolver.value(arguments.src1)?,
@ -2201,12 +2269,118 @@ impl<'a> MethodEmitContext<'a> {
         self.emit_intrinsic(
             unsafe { CStr::from_bytes_with_nul_unchecked(llvm_intrinsic.as_bytes()) },
             Some(arguments.dst),
-            &data.type_.into(),
+            Some(&data.type_.into()),
             intrinsic_arguments,
         )?;
         Ok(())
     }
fn emit_mul24(
&mut self,
data: ast::Mul24Details,
arguments: ast::Mul24Args<SpirvWord>,
) -> Result<(), TranslateError> {
let src1 = self.resolver.value(arguments.src1)?;
let src2 = self.resolver.value(arguments.src2)?;
let name_lo = match data.type_ {
ast::ScalarType::U32 => c"llvm.amdgcn.mul.u24",
ast::ScalarType::S32 => c"llvm.amdgcn.mul.i24",
_ => return Err(error_unreachable()),
};
let res_lo = self.emit_intrinsic(
name_lo,
if data.control == Mul24Control::Lo { Some(arguments.dst) } else { None },
Some(&ast::Type::Scalar(data.type_)),
vec![
(src1, get_scalar_type(self.context, data.type_)),
(src2, get_scalar_type(self.context, data.type_)),
],
)?;
if data.control == Mul24Control::Hi {
// There is an important difference between NVIDIA's mul24.hi and AMD's mulhi.[ui]24.
// NVIDIA: Returns bits 47..16 of the 64-bit result
// AMD: Returns bits 63..32 of the 64-bit result
// Hence we need to compute both hi and lo, shift the results and add them together to replicate NVIDIA's mul24
let name_hi = match data.type_ {
ast::ScalarType::U32 => c"llvm.amdgcn.mulhi.u24",
ast::ScalarType::S32 => c"llvm.amdgcn.mulhi.i24",
_ => return Err(error_unreachable()),
};
let res_hi = self.emit_intrinsic(
name_hi,
None,
Some(&ast::Type::Scalar(data.type_)),
vec![
(src1, get_scalar_type(self.context, data.type_)),
(src2, get_scalar_type(self.context, data.type_)),
],
)?;
let shift_number = unsafe { LLVMConstInt(LLVMInt32TypeInContext(self.context), 16, 0) };
let res_lo_shr = unsafe {
LLVMBuildLShr(self.builder, res_lo, shift_number, LLVM_UNNAMED.as_ptr())
};
let res_hi_shl =
unsafe { LLVMBuildShl(self.builder, res_hi, shift_number, LLVM_UNNAMED.as_ptr()) };
self.resolver
.with_result(arguments.dst, |dst: *const i8| unsafe {
LLVMBuildOr(self.builder, res_lo_shr, res_hi_shl, dst)
});
}
Ok(())
}
fn emit_set_mode(&mut self, mode_reg: ModeRegister) -> Result<(), TranslateError> {
fn hwreg(reg: u32, offset: u32, size: u32) -> u32 {
reg | (offset << 6) | ((size - 1) << 11)
}
fn denormal_to_value(ftz: bool) -> u32 {
if ftz {
0
} else {
3
}
}
fn rounding_to_value(ftz: ast::RoundingMode) -> u32 {
match ftz {
ptx_parser::RoundingMode::NearestEven => 0,
ptx_parser::RoundingMode::Zero => 3,
ptx_parser::RoundingMode::NegativeInf => 2,
ptx_parser::RoundingMode::PositiveInf => 1,
}
}
fn merge_regs(f32: u32, f16f64: u32) -> u32 {
f32 | f16f64 << 2
}
let intrinsic = c"llvm.amdgcn.s.setreg";
let (hwreg, value) = match mode_reg {
ModeRegister::Denormal { f32, f16f64 } => {
let hwreg = hwreg(1, 4, 4);
let f32 = denormal_to_value(f32);
let f16f64 = denormal_to_value(f16f64);
let value = merge_regs(f32, f16f64);
(hwreg, value)
}
ModeRegister::Rounding { f32, f16f64 } => {
let hwreg = hwreg(1, 0, 4);
let f32 = rounding_to_value(f32);
let f16f64 = rounding_to_value(f16f64);
let value = merge_regs(f32, f16f64);
(hwreg, value)
}
};
let llvm_i32 = get_scalar_type(self.context, ast::ScalarType::B32);
let hwreg_llvm = unsafe { LLVMConstInt(llvm_i32, hwreg as _, 0) };
let value_llvm = unsafe { LLVMConstInt(llvm_i32, value as _, 0) };
self.emit_intrinsic(
intrinsic,
None,
None,
vec![(hwreg_llvm, llvm_i32), (value_llvm, llvm_i32)],
)?;
Ok(())
}
/* /*
// Currently unused, LLVM 18 (ROCm 6.2) does not support `llvm.set.rounding` // Currently unused, LLVM 18 (ROCm 6.2) does not support `llvm.set.rounding`
// Should be available in LLVM 19 // Should be available in LLVM 19
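The `emit_mul24` and `emit_set_mode` additions above both come down to bit arithmetic that can be sanity-checked outside the compiler. The following standalone Rust sketch (illustrative only; none of the helper names below are ZLUDA identifiers) emulates `mul.u24`/`mulhi.u24` with 64-bit math to confirm that `(lo >> 16) | (hi << 16)` reproduces NVIDIA's `mul24.hi` semantics (bits 47..16 of the 48-bit product), and prints the `s_setreg` operands that the `hwreg` helper encodes for the rounding and denormal mode fields.

```rust
// Standalone check of the arithmetic behind the mul24 lowering and the
// mode-register prelude shown in the diff above. Nothing here depends on ZLUDA.
fn mul24_full(a: u32, b: u32) -> u64 {
    // 48-bit product of the low 24 bits of both operands
    (a as u64 & 0xFF_FFFF) * (b as u64 & 0xFF_FFFF)
}

fn main() {
    for (a, b) in [(0x00FF_FFFFu32, 0x00FF_FFFF), (12_345, 67_890), (1 << 23, 3)] {
        let full = mul24_full(a, b);
        let lo = full as u32; // what llvm.amdgcn.mul.u24 yields (bits 31..0)
        let hi = (full >> 32) as u32; // what llvm.amdgcn.mulhi.u24 yields (bits 63..32)
        let nvidia_hi = (full >> 16) as u32; // PTX mul24.hi: bits 47..16
        // Recombination emitted for the `hi` form: shift lo right, hi left, then OR
        assert_eq!((lo >> 16) | (hi << 16), nvidia_hi);
    }

    // s_setreg operand encoding from emit_set_mode:
    // hwreg(id, offset, size) = id | offset << 6 | (size - 1) << 11
    let hwreg = |id: u32, offset: u32, size: u32| id | (offset << 6) | ((size - 1) << 11);
    println!("rounding field: {:#x}", hwreg(1, 0, 4)); // prints 0x1801
    println!("denormal field: {:#x}", hwreg(1, 4, 4)); // prints 0x1901
}
```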

View File

@ -2,8 +2,8 @@ use super::*;
 pub(super) fn run<'a, 'input>(
     resolver: &mut GlobalStringIdentResolver2<'input>,
-    directives: Vec<UnconditionalDirective<'input>>,
-) -> Result<Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
+    directives: Vec<UnconditionalDirective>,
+) -> Result<Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
     directives
         .into_iter()
         .map(|directive| run_directive(resolver, directive))
@ -13,11 +13,10 @@ pub(super) fn run<'a, 'input>(
 fn run_directive<'input>(
     resolver: &mut GlobalStringIdentResolver2<'input>,
     directive: Directive2<
-        'input,
         ast::Instruction<ast::ParsedOperand<SpirvWord>>,
         ast::ParsedOperand<SpirvWord>,
     >,
-) -> Result<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
+) -> Result<Directive2<ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
     Ok(match directive {
         Directive2::Variable(linking, var) => Directive2::Variable(linking, var),
         Directive2::Method(method) => Directive2::Method(run_method(resolver, method)?),
@ -27,11 +26,10 @@ fn run_directive<'input>(
 fn run_method<'input>(
     resolver: &mut GlobalStringIdentResolver2<'input>,
     method: Function2<
-        'input,
         ast::Instruction<ast::ParsedOperand<SpirvWord>>,
         ast::ParsedOperand<SpirvWord>,
     >,
-) -> Result<Function2<'input, ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
+) -> Result<Function2<ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
     let body = method
         .body
         .map(|statements| {
@ -43,12 +41,18 @@ fn run_method<'input>(
         })
         .transpose()?;
     Ok(Function2 {
-        func_decl: method.func_decl,
-        globals: method.globals,
         body,
+        return_arguments: method.return_arguments,
+        name: method.name,
+        input_arguments: method.input_arguments,
         import_as: method.import_as,
         tuning: method.tuning,
         linkage: method.linkage,
+        is_kernel: method.is_kernel,
+        flush_to_zero_f32: method.flush_to_zero_f32,
+        flush_to_zero_f16f64: method.flush_to_zero_f16f64,
+        rounding_mode_f32: method.rounding_mode_f32,
+        rounding_mode_f16f64: method.rounding_mode_f16f64,
     })
 }

View File

@ -1,30 +1,33 @@
 use super::*;
 pub(super) fn run<'a, 'input>(
-    resolver: &mut GlobalStringIdentResolver2<'input>,
+    resolver: &'a mut GlobalStringIdentResolver2<'input>,
     special_registers: &'a SpecialRegistersMap2,
-    directives: Vec<UnconditionalDirective<'input>>,
-) -> Result<Vec<UnconditionalDirective<'input>>, TranslateError> {
-    let declarations = SpecialRegistersMap2::generate_declarations(resolver);
-    let mut result = Vec::with_capacity(declarations.len() + directives.len());
+    directives: Vec<UnconditionalDirective>,
+) -> Result<Vec<UnconditionalDirective>, TranslateError> {
+    let mut result = Vec::with_capacity(SpecialRegistersMap2::len() + directives.len());
     let mut sreg_to_function =
-        FxHashMap::with_capacity_and_hasher(declarations.len(), Default::default());
-    for (sreg, declaration) in declarations {
-        let name = if let ast::MethodName::Func(name) = declaration.name {
-            name
-        } else {
-            return Err(error_unreachable());
-        };
-        result.push(UnconditionalDirective::Method(UnconditionalFunction {
-            func_decl: declaration,
-            globals: Vec::new(),
-            body: None,
-            import_as: None,
-            tuning: Vec::new(),
-            linkage: ast::LinkingDirective::EXTERN,
-        }));
-        sreg_to_function.insert(sreg, name);
-    }
+        FxHashMap::with_capacity_and_hasher(SpecialRegistersMap2::len(), Default::default());
+    SpecialRegistersMap2::foreach_declaration(
+        resolver,
+        |sreg, (return_arguments, name, input_arguments)| {
+            result.push(UnconditionalDirective::Method(UnconditionalFunction {
+                return_arguments,
+                name,
+                input_arguments,
+                body: None,
+                import_as: None,
+                tuning: Vec::new(),
+                linkage: ast::LinkingDirective::EXTERN,
+                is_kernel: false,
+                flush_to_zero_f32: false,
+                flush_to_zero_f16f64: false,
+                rounding_mode_f32: ptx_parser::RoundingMode::NearestEven,
+                rounding_mode_f16f64: ptx_parser::RoundingMode::NearestEven,
+            }));
+            sreg_to_function.insert(sreg, name);
+        },
+    );
     let mut visitor = SpecialRegisterResolver {
         resolver,
         special_registers,
@ -39,8 +42,8 @@ pub(super) fn run<'a, 'input>(
 fn run_directive<'a, 'input>(
     visitor: &mut SpecialRegisterResolver<'a, 'input>,
-    directive: UnconditionalDirective<'input>,
-) -> Result<UnconditionalDirective<'input>, TranslateError> {
+    directive: UnconditionalDirective,
+) -> Result<UnconditionalDirective, TranslateError> {
     Ok(match directive {
         var @ Directive2::Variable(..) => var,
         Directive2::Method(method) => Directive2::Method(run_method(visitor, method)?),
@ -49,8 +52,8 @@ fn run_directive<'a, 'input>(
 fn run_method<'a, 'input>(
     visitor: &mut SpecialRegisterResolver<'a, 'input>,
-    method: UnconditionalFunction<'input>,
-) -> Result<UnconditionalFunction<'input>, TranslateError> {
+    method: UnconditionalFunction,
+) -> Result<UnconditionalFunction, TranslateError> {
     let body = method
         .body
         .map(|statements| {
@ -61,14 +64,7 @@ fn run_method<'a, 'input>(
             Ok::<_, TranslateError>(result)
         })
         .transpose()?;
-    Ok(Function2 {
-        func_decl: method.func_decl,
-        globals: method.globals,
-        body,
-        import_as: method.import_as,
-        tuning: method.tuning,
-        linkage: method.linkage,
-    })
+    Ok(Function2 { body, ..method })
 }
 fn run_statement<'a, 'input>(

View File

@ -1,8 +1,8 @@
 use super::*;
 pub(super) fn run<'input>(
-    directives: Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>,
-) -> Result<Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
+    directives: Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>,
+) -> Result<Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
     let mut result = Vec::with_capacity(directives.len());
     for mut directive in directives.into_iter() {
         run_directive(&mut result, &mut directive)?;
@ -12,8 +12,8 @@ pub(super) fn run<'input>(
 }
 fn run_directive<'input>(
-    result: &mut Vec<Directive2<'input, ptx_parser::Instruction<SpirvWord>, SpirvWord>>,
-    directive: &mut Directive2<'input, ptx_parser::Instruction<SpirvWord>, SpirvWord>,
+    result: &mut Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>,
+    directive: &mut Directive2<ast::Instruction<SpirvWord>, SpirvWord>,
 ) -> Result<(), TranslateError> {
     match directive {
         Directive2::Variable(..) => {}
@ -23,8 +23,8 @@ fn run_directive<'input>(
 }
 fn run_function<'input>(
-    result: &mut Vec<Directive2<'input, ptx_parser::Instruction<SpirvWord>, SpirvWord>>,
-    function: &mut Function2<'input, ptx_parser::Instruction<SpirvWord>, SpirvWord>,
+    result: &mut Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>,
+    function: &mut Function2<ast::Instruction<SpirvWord>, SpirvWord>,
 ) {
     function.body = function.body.take().map(|statements| {
         statements

View File

@ -11,8 +11,8 @@ use super::*;
 // pass, so we do nothing there
 pub(super) fn run<'a, 'input>(
     resolver: &mut GlobalStringIdentResolver2<'input>,
-    directives: Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>,
-) -> Result<Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
+    directives: Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>,
+) -> Result<Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
     directives
         .into_iter()
         .map(|directive| run_directive(resolver, directive))
@ -21,8 +21,8 @@ pub(super) fn run<'a, 'input>(
 fn run_directive<'a, 'input>(
     resolver: &mut GlobalStringIdentResolver2<'input>,
-    directive: Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>,
-) -> Result<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
+    directive: Directive2<ast::Instruction<SpirvWord>, SpirvWord>,
+) -> Result<Directive2<ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
     Ok(match directive {
         var @ Directive2::Variable(..) => var,
         Directive2::Method(method) => {
@ -34,12 +34,11 @@ fn run_directive<'a, 'input>(
 fn run_method<'a, 'input>(
     mut visitor: InsertMemSSAVisitor<'a, 'input>,
-    method: Function2<'input, ast::Instruction<SpirvWord>, SpirvWord>,
-) -> Result<Function2<'input, ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
-    let mut func_decl = method.func_decl;
-    let is_kernel = func_decl.name.is_kernel();
+    mut method: Function2<ast::Instruction<SpirvWord>, SpirvWord>,
+) -> Result<Function2<ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
+    let is_kernel = method.is_kernel;
     if is_kernel {
-        for arg in func_decl.input_arguments.iter_mut() {
+        for arg in method.input_arguments.iter_mut() {
             let old_name = arg.name;
             let old_space = arg.state_space;
             let new_space = ast::StateSpace::ParamEntry;
@ -51,10 +50,10 @@ fn run_method<'a, 'input>(
             arg.state_space = new_space;
         }
     };
-    for arg in func_decl.return_arguments.iter_mut() {
+    for arg in method.return_arguments.iter_mut() {
         visitor.visit_variable(arg)?;
     }
-    let return_arguments = &func_decl.return_arguments[..];
+    let return_arguments = &method.return_arguments[..];
     let body = method
         .body
         .map(move |statements| {
@ -65,14 +64,7 @@ fn run_method<'a, 'input>(
             Ok::<_, TranslateError>(result)
         })
        .transpose()?;
-    Ok(Function2 {
-        func_decl: func_decl,
-        globals: method.globals,
-        body,
-        import_as: method.import_as,
-        tuning: method.tuning,
-        linkage: method.linkage,
-    })
+    Ok(Function2 { body, ..method })
 }
 fn run_statement<'a, 'input>(

View File

@ -19,8 +19,8 @@ use ptx_parser as ast;
 */
 pub(super) fn run<'input>(
     resolver: &mut GlobalStringIdentResolver2<'input>,
-    directives: Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>,
-) -> Result<Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
+    directives: Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>,
+) -> Result<Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
     directives
         .into_iter()
         .map(|directive| run_directive(resolver, directive))
@ -29,8 +29,8 @@ pub(super) fn run<'input>(
 fn run_directive<'a, 'input>(
     resolver: &mut GlobalStringIdentResolver2<'input>,
-    directive: Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>,
-) -> Result<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
+    directive: Directive2<ast::Instruction<SpirvWord>, SpirvWord>,
+) -> Result<Directive2<ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
     Ok(match directive {
         var @ Directive2::Variable(..) => var,
         Directive2::Method(mut method) => {

View File

@ -0,0 +1,29 @@
.version 6.5
.target sm_50
.address_size 64
.func use_modes();
.visible .entry kernel()
{
.reg .f32 temp;
add.rz.ftz.f32 temp, temp, temp;
call use_modes;
add.rp.ftz.f32 temp, temp, temp;
ret;
}
.func use_modes()
{
.reg .f32 temp;
.reg .pred pred;
@pred bra SET_RM;
@!pred bra SET_RZ;
SET_RM:
add.rm.f32 temp, temp, temp;
ret;
SET_RZ:
add.rz.f32 temp, temp, temp;
ret;
}

View File

@ -0,0 +1,15 @@
.version 6.5
.target sm_30
.address_size 64
.visible .entry add()
{
.reg .f32 temp<3>;
add.ftz.f16 temp2, temp1, temp0;
add.ftz.f32 temp2, temp1, temp0;
add.f16 temp2, temp1, temp0;
add.f32 temp2, temp1, temp0;
ret;
}

File diff suppressed because it is too large

View File

@ -0,0 +1,399 @@
use super::*;
use int_enum::IntEnum;
use strum::EnumCount;
#[repr(usize)]
#[derive(IntEnum, Eq, PartialEq, Copy, Clone, Debug)]
enum Bool {
False = 0,
True = 1,
}
fn ftz() -> InstructionModes {
InstructionModes {
denormal_f32: Some(DenormalMode::FlushToZero),
denormal_f16f64: None,
rounding_f32: None,
rounding_f16f64: None,
}
}
fn preserve() -> InstructionModes {
InstructionModes {
denormal_f32: Some(DenormalMode::Preserve),
denormal_f16f64: None,
rounding_f32: None,
rounding_f16f64: None,
}
}
#[test]
fn transitive_mixed() {
let mut graph = ControlFlowGraph::new();
let entry_id = SpirvWord(1);
let false_id = SpirvWord(2);
let empty_id = SpirvWord(3);
let false2_id = SpirvWord(4);
let entry = graph.add_entry_basic_block(entry_id);
graph.add_jump(entry, false_id);
let false_ = graph.get_or_add_basic_block(false_id);
graph.set_modes(false_, ftz(), ftz());
graph.add_jump(false_, empty_id);
let empty = graph.get_or_add_basic_block(empty_id);
graph.add_jump(empty, false2_id);
let false2_ = graph.get_or_add_basic_block(false2_id);
graph.set_modes(false2_, ftz(), ftz());
let partial_result = super::compute_single_mode_insertions(&graph, |node| node.denormal_f32);
assert_eq!(partial_result.bb_must_insert_mode.len(), 0);
assert_eq!(partial_result.bb_maybe_insert_mode.len(), 1);
assert_eq!(
partial_result.bb_maybe_insert_mode[&false_id],
(DenormalMode::FlushToZero, iter::once(entry_id).collect())
);
let result = optimize_mode_insertions::<DenormalMode, { DenormalMode::COUNT }>(partial_result);
assert_eq!(result.basic_blocks.len(), 0);
assert_eq!(result.kernels.len(), 1);
assert_eq!(result.kernels[&entry_id], DenormalMode::FlushToZero);
}
#[test]
fn transitive_change_twice() {
let mut graph = ControlFlowGraph::new();
let entry_id = SpirvWord(1);
let false_id = SpirvWord(2);
let empty_id = SpirvWord(3);
let true_id = SpirvWord(4);
let entry = graph.add_entry_basic_block(entry_id);
graph.add_jump(entry, false_id);
let false_ = graph.get_or_add_basic_block(false_id);
graph.set_modes(false_, ftz(), ftz());
graph.add_jump(false_, empty_id);
let empty = graph.get_or_add_basic_block(empty_id);
graph.add_jump(empty, true_id);
let true_ = graph.get_or_add_basic_block(true_id);
graph.set_modes(true_, preserve(), preserve());
let partial_result = super::compute_single_mode_insertions(&graph, |node| node.denormal_f32);
assert_eq!(partial_result.bb_must_insert_mode.len(), 1);
assert!(partial_result.bb_must_insert_mode.contains(&true_id));
assert_eq!(partial_result.bb_maybe_insert_mode.len(), 1);
assert_eq!(
partial_result.bb_maybe_insert_mode[&false_id],
(DenormalMode::FlushToZero, iter::once(entry_id).collect())
);
let result = optimize_mode_insertions::<DenormalMode, { DenormalMode::COUNT }>(partial_result);
assert_eq!(result.basic_blocks, iter::once(true_id).collect());
assert_eq!(result.kernels.len(), 1);
assert_eq!(result.kernels[&entry_id], DenormalMode::FlushToZero);
}
#[test]
fn transitive_change() {
let mut graph = ControlFlowGraph::new();
let entry_id = SpirvWord(1);
let empty_id = SpirvWord(2);
let true_id = SpirvWord(3);
let entry = graph.add_entry_basic_block(entry_id);
graph.add_jump(entry, empty_id);
let empty = graph.get_or_add_basic_block(empty_id);
graph.add_jump(empty, true_id);
let true_ = graph.get_or_add_basic_block(true_id);
graph.set_modes(true_, preserve(), preserve());
let partial_result = super::compute_single_mode_insertions(&graph, |node| node.denormal_f32);
assert_eq!(partial_result.bb_must_insert_mode.len(), 0);
assert_eq!(partial_result.bb_maybe_insert_mode.len(), 1);
assert_eq!(
partial_result.bb_maybe_insert_mode[&true_id],
(DenormalMode::Preserve, iter::once(entry_id).collect())
);
let result = optimize_mode_insertions::<DenormalMode, { DenormalMode::COUNT }>(partial_result);
assert_eq!(result.basic_blocks.len(), 0);
assert_eq!(result.kernels.len(), 1);
assert_eq!(result.kernels[&entry_id], DenormalMode::Preserve);
}
#[test]
fn codependency() {
let mut graph = ControlFlowGraph::new();
let entry_id = SpirvWord(1);
let left_f_id = SpirvWord(2);
let right_f_id = SpirvWord(3);
let left_none_id = SpirvWord(4);
let mid_none_id = SpirvWord(5);
let right_none_id = SpirvWord(6);
let entry = graph.add_entry_basic_block(entry_id);
graph.add_jump(entry, left_f_id);
graph.add_jump(entry, right_f_id);
let left_f = graph.get_or_add_basic_block(left_f_id);
graph.set_modes(left_f, ftz(), ftz());
let right_f = graph.get_or_add_basic_block(right_f_id);
graph.set_modes(right_f, ftz(), ftz());
graph.add_jump(left_f, left_none_id);
let left_none = graph.get_or_add_basic_block(left_none_id);
graph.add_jump(right_f, right_none_id);
let right_none = graph.get_or_add_basic_block(right_none_id);
graph.add_jump(left_none, mid_none_id);
graph.add_jump(right_none, mid_none_id);
let mid_none = graph.get_or_add_basic_block(mid_none_id);
graph.add_jump(mid_none, left_none_id);
graph.add_jump(mid_none, right_none_id);
//println!(
// "{:?}",
// petgraph::dot::Dot::with_config(&graph.graph, &[petgraph::dot::Config::EdgeNoLabel])
//);
let partial_result = super::compute_single_mode_insertions(&graph, |node| node.denormal_f32);
assert_eq!(partial_result.bb_must_insert_mode.len(), 0);
assert_eq!(partial_result.bb_maybe_insert_mode.len(), 2);
assert_eq!(
partial_result.bb_maybe_insert_mode[&left_f_id],
(DenormalMode::FlushToZero, iter::once(entry_id).collect())
);
assert_eq!(
partial_result.bb_maybe_insert_mode[&right_f_id],
(DenormalMode::FlushToZero, iter::once(entry_id).collect())
);
let result = optimize_mode_insertions::<DenormalMode, { DenormalMode::COUNT }>(partial_result);
assert_eq!(result.basic_blocks.len(), 0);
assert_eq!(result.kernels.len(), 1);
assert_eq!(result.kernels[&entry_id], DenormalMode::FlushToZero);
}
static FOLD_DENORMAL_PTX: &'static str = include_str!("fold_denormal.ptx");
#[test]
fn fold_denormal() {
let method = compile_methods(FOLD_DENORMAL_PTX).pop().unwrap();
assert_eq!(true, method.flush_to_zero_f32);
assert_eq!(true, method.flush_to_zero_f16f64);
let method_body = method.body.unwrap();
assert!(matches!(
&*method_body,
[
Statement::Label(..),
Statement::Variable(..),
Statement::Variable(..),
Statement::Variable(..),
Statement::Instruction(ast::Instruction::Add { .. }),
Statement::Instruction(ast::Instruction::Add { .. }),
Statement::SetMode(ModeRegister::Denormal {
f32: false,
f16f64: false
}),
Statement::Instruction(ast::Instruction::Add { .. }),
Statement::Instruction(ast::Instruction::Add { .. }),
Statement::Instruction(ast::Instruction::Ret { .. }),
]
));
}
fn compile_methods(ptx: &str) -> Vec<Function2<ast::Instruction<SpirvWord>, SpirvWord>> {
use crate::pass::*;
let module = ptx_parser::parse_module_checked(ptx).unwrap();
let mut flat_resolver = GlobalStringIdentResolver2::new(SpirvWord(1));
let mut scoped_resolver = ScopedResolver::new(&mut flat_resolver);
let directives = normalize_identifiers2::run(&mut scoped_resolver, module.directives).unwrap();
let directives = normalize_predicates2::run(&mut flat_resolver, directives).unwrap();
let directives = expand_operands::run(&mut flat_resolver, directives).unwrap();
let directives = normalize_basic_blocks::run(&mut flat_resolver, directives).unwrap();
let directives = super::run(&mut flat_resolver, directives).unwrap();
directives
.into_iter()
.filter_map(|s| match s {
Directive2::Method(m) => Some(m),
_ => None,
})
.collect::<Vec<_>>()
}
static CALL_WITH_MODE_PTX: &'static str = include_str!("call_with_mode.ptx");
#[test]
fn call_with_mode() {
let methods = compile_methods(CALL_WITH_MODE_PTX);
assert!(matches!(methods[0].body, None));
let method_1 = methods[1].body.as_ref().unwrap();
assert!(matches!(
&**method_1,
[
Statement::Label(..),
Statement::Variable(..),
Statement::Instruction(ast::Instruction::Add { .. }),
Statement::Instruction(ast::Instruction::Call { .. }),
Statement::Instruction(ast::Instruction::Bra { .. }),
Statement::Label(..),
// Dual prelude
Statement::SetMode(ModeRegister::Denormal {
f32: true,
f16f64: true
}),
Statement::SetMode(ModeRegister::Rounding {
f32: ast::RoundingMode::PositiveInf,
f16f64: ast::RoundingMode::NearestEven
}),
Statement::Instruction(ast::Instruction::Bra { .. }),
// Denormal prelude
Statement::Label(..),
Statement::SetMode(ModeRegister::Denormal {
f32: true,
f16f64: true
}),
Statement::Instruction(ast::Instruction::Bra { .. }),
// Rounding prelude
Statement::Label(..),
Statement::SetMode(ModeRegister::Rounding {
f32: ast::RoundingMode::PositiveInf,
f16f64: ast::RoundingMode::NearestEven
}),
Statement::Instruction(ast::Instruction::Bra { .. }),
Statement::Label(..),
Statement::Instruction(ast::Instruction::Add { .. }),
Statement::Instruction(ast::Instruction::Ret { .. }),
]
));
let [to_fn0] = calls(method_1);
let [_, dual_prelude, _, _, add] = labels(method_1);
let [post_call, post_prelude_dual, post_prelude_denormal, post_prelude_rounding] =
branches(method_1);
assert_eq!(methods[0].name, to_fn0);
assert_eq!(post_call, dual_prelude);
assert_eq!(post_prelude_dual, add);
assert_eq!(post_prelude_denormal, add);
assert_eq!(post_prelude_rounding, add);
let method_2 = methods[2].body.as_ref().unwrap();
assert!(matches!(
&**method_2,
[
Statement::Label(..),
Statement::Variable(..),
Statement::Variable(..),
Statement::Conditional(..),
Statement::Label(..),
Statement::Conditional(..),
Statement::Label(..),
Statement::Instruction(ast::Instruction::Bra { .. }),
Statement::Label(..),
// Dual prelude
Statement::SetMode(ModeRegister::Denormal {
f32: false,
f16f64: true
}),
Statement::SetMode(ModeRegister::Rounding {
f32: ast::RoundingMode::NegativeInf,
f16f64: ast::RoundingMode::NearestEven
}),
Statement::Instruction(ast::Instruction::Bra { .. }),
// Denormal prelude
Statement::Label(..),
Statement::SetMode(ModeRegister::Denormal {
f32: false,
f16f64: true
}),
Statement::Instruction(ast::Instruction::Bra { .. }),
// Rounding prelude
Statement::Label(..),
Statement::SetMode(ModeRegister::Rounding {
f32: ast::RoundingMode::NegativeInf,
f16f64: ast::RoundingMode::NearestEven
}),
Statement::Instruction(ast::Instruction::Bra { .. }),
Statement::Label(..),
Statement::Instruction(ast::Instruction::Add { .. }),
Statement::Instruction(ast::Instruction::Bra { .. }),
Statement::Label(..),
Statement::SetMode(ModeRegister::Denormal {
f32: false,
f16f64: true
}),
Statement::Instruction(ast::Instruction::Bra { .. }),
Statement::Label(..),
Statement::Instruction(ast::Instruction::Add { .. }),
Statement::Instruction(ast::Instruction::Bra { .. }),
Statement::Label(..),
Statement::Instruction(ast::Instruction::Ret { .. }),
]
));
let [(if_rm_true, if_rm_false), (if_rz_true, if_rz_false)] = conditionals(method_2);
let [_, conditional2, post_conditional2, prelude_dual, _, _, add1, add2_set_denormal, add2, ret] =
labels(method_2);
let [post_conditional2_jump, post_prelude_dual, post_prelude_denormal, post_prelude_rounding, post_add1, post_add2_set_denormal, post_add2] =
branches(method_2);
assert_eq!(if_rm_true, prelude_dual);
assert_eq!(if_rm_false, conditional2);
assert_eq!(if_rz_true, post_conditional2);
assert_eq!(if_rz_false, add2_set_denormal);
assert_eq!(post_conditional2_jump, prelude_dual);
assert_eq!(post_prelude_dual, add1);
assert_eq!(post_prelude_denormal, add1);
assert_eq!(post_prelude_rounding, add1);
assert_eq!(post_add1, ret);
assert_eq!(post_add2_set_denormal, add2);
assert_eq!(post_add2, ret);
}
fn branches<const N: usize>(
fn_: &Vec<Statement<ast::Instruction<SpirvWord>, SpirvWord>>,
) -> [SpirvWord; N] {
fn_.iter()
.filter_map(|s| match s {
Statement::Instruction(ast::Instruction::Bra {
arguments: ast::BraArgs { src },
}) => Some(*src),
_ => None,
})
.collect::<Vec<_>>()
.try_into()
.unwrap()
}
fn labels<const N: usize>(
fn_: &Vec<Statement<ast::Instruction<SpirvWord>, SpirvWord>>,
) -> [SpirvWord; N] {
fn_.iter()
.filter_map(
|s: &Statement<ptx_parser::Instruction<SpirvWord>, SpirvWord>| match s {
Statement::Label(label) => Some(*label),
_ => None,
},
)
.collect::<Vec<_>>()
.try_into()
.unwrap()
}
fn calls<const N: usize>(
fn_: &Vec<Statement<ast::Instruction<SpirvWord>, SpirvWord>>,
) -> [SpirvWord; N] {
fn_.iter()
.filter_map(|s| match s {
Statement::Instruction(ast::Instruction::Call {
arguments: ast::CallArgs { func, .. },
..
}) => Some(*func),
_ => None,
})
.collect::<Vec<_>>()
.try_into()
.unwrap()
}
fn conditionals<const N: usize>(
fn_: &Vec<Statement<ast::Instruction<SpirvWord>, SpirvWord>>,
) -> [(SpirvWord, SpirvWord); N] {
fn_.iter()
.filter_map(|s| match s {
Statement::Conditional(BrachCondition {
if_true, if_false, ..
}) => Some((*if_true, *if_false)),
_ => None,
})
.collect::<Vec<_>>()
.try_into()
.unwrap()
}

View File

@ -17,12 +17,15 @@ mod expand_operands;
mod fix_special_registers2; mod fix_special_registers2;
mod hoist_globals; mod hoist_globals;
mod insert_explicit_load_store; mod insert_explicit_load_store;
mod instruction_mode_to_global_mode;
mod insert_implicit_conversions2; mod insert_implicit_conversions2;
mod normalize_basic_blocks;
mod normalize_identifiers2; mod normalize_identifiers2;
mod normalize_predicates2; mod normalize_predicates2;
mod remove_unreachable_basic_blocks;
mod replace_instructions_with_function_calls; mod replace_instructions_with_function_calls;
mod resolve_function_pointers;
mod replace_known_functions; mod replace_known_functions;
mod resolve_function_pointers;
static ZLUDA_PTX_IMPL: &'static [u8] = include_bytes!("../../lib/zluda_ptx_impl.bc"); static ZLUDA_PTX_IMPL: &'static [u8] = include_bytes!("../../lib/zluda_ptx_impl.bc");
const ZLUDA_PTX_PREFIX: &'static str = "__zluda_ptx_impl_"; const ZLUDA_PTX_PREFIX: &'static str = "__zluda_ptx_impl_";
@ -43,12 +46,15 @@ pub fn to_llvm_module<'input>(ast: ast::Module<'input>) -> Result<Module, Transl
let mut scoped_resolver = ScopedResolver::new(&mut flat_resolver); let mut scoped_resolver = ScopedResolver::new(&mut flat_resolver);
let sreg_map = SpecialRegistersMap2::new(&mut scoped_resolver)?; let sreg_map = SpecialRegistersMap2::new(&mut scoped_resolver)?;
let directives = normalize_identifiers2::run(&mut scoped_resolver, ast.directives)?; let directives = normalize_identifiers2::run(&mut scoped_resolver, ast.directives)?;
let directives = replace_known_functions::run(&flat_resolver, directives); let directives = replace_known_functions::run(&mut flat_resolver, directives);
let directives = normalize_predicates2::run(&mut flat_resolver, directives)?; let directives = normalize_predicates2::run(&mut flat_resolver, directives)?;
let directives = resolve_function_pointers::run(directives)?; let directives = resolve_function_pointers::run(directives)?;
let directives: Vec<Directive2<'_, ptx_parser::Instruction<ptx_parser::ParsedOperand<SpirvWord>>, ptx_parser::ParsedOperand<SpirvWord>>> = fix_special_registers2::run(&mut flat_resolver, &sreg_map, directives)?; let directives = fix_special_registers2::run(&mut flat_resolver, &sreg_map, directives)?;
let directives = expand_operands::run(&mut flat_resolver, directives)?; let directives = expand_operands::run(&mut flat_resolver, directives)?;
let directives = deparamize_functions::run(&mut flat_resolver, directives)?; let directives = deparamize_functions::run(&mut flat_resolver, directives)?;
let directives = normalize_basic_blocks::run(&mut flat_resolver, directives)?;
let directives = remove_unreachable_basic_blocks::run(directives)?;
let directives = instruction_mode_to_global_mode::run(&mut flat_resolver, directives)?;
let directives = insert_explicit_load_store::run(&mut flat_resolver, directives)?; let directives = insert_explicit_load_store::run(&mut flat_resolver, directives)?;
let directives = insert_implicit_conversions2::run(&mut flat_resolver, directives)?; let directives = insert_implicit_conversions2::run(&mut flat_resolver, directives)?;
let directives = replace_instructions_with_function_calls::run(&mut flat_resolver, directives)?; let directives = replace_instructions_with_function_calls::run(&mut flat_resolver, directives)?;
@ -61,7 +67,7 @@ pub fn to_llvm_module<'input>(ast: ast::Module<'input>) -> Result<Module, Transl
} }
pub struct Module { pub struct Module {
pub llvm_ir: emit_llvm::MemoryBuffer, pub llvm_ir: emit_llvm::Module,
pub kernel_info: HashMap<String, KernelInfo>, pub kernel_info: HashMap<String, KernelInfo>,
} }
@ -195,6 +201,20 @@ enum Statement<I, P: ast::Operand> {
FunctionPointer(FunctionPointerDetails), FunctionPointer(FunctionPointerDetails),
VectorRead(VectorRead), VectorRead(VectorRead),
VectorWrite(VectorWrite), VectorWrite(VectorWrite),
SetMode(ModeRegister),
}
#[derive(Eq, PartialEq, Clone, Copy)]
#[cfg_attr(test, derive(Debug))]
enum ModeRegister {
Denormal {
f32: bool,
f16f64: bool,
},
Rounding {
f32: ast::RoundingMode,
f16f64: ast::RoundingMode,
},
} }
impl<T: ast::Operand<Ident = SpirvWord>> Statement<ast::Instruction<T>, T> { impl<T: ast::Operand<Ident = SpirvWord>> Statement<ast::Instruction<T>, T> {
@ -467,6 +487,7 @@ impl<T: ast::Operand<Ident = SpirvWord>> Statement<ast::Instruction<T>, T> {
let src = visitor.visit_ident(src, None, false, false)?; let src = visitor.visit_ident(src, None, false, false)?;
Statement::FunctionPointer(FunctionPointerDetails { dst, src }) Statement::FunctionPointer(FunctionPointerDetails { dst, src })
} }
Statement::SetMode(mode_register) => Statement::SetMode(mode_register),
}) })
} }
} }
@ -525,7 +546,7 @@ struct FunctionPointerDetails {
src: SpirvWord, src: SpirvWord,
} }
#[derive(Copy, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] #[derive(Copy, Clone, PartialEq, Eq, Hash, PartialOrd, Ord, Debug)]
pub struct SpirvWord(u32); pub struct SpirvWord(u32);
impl From<u32> for SpirvWord { impl From<u32> for SpirvWord {
@ -557,22 +578,27 @@ type NormalizedStatement = Statement<
ast::ParsedOperand<SpirvWord>, ast::ParsedOperand<SpirvWord>,
>; >;
enum Directive2<'input, Instruction, Operand: ast::Operand> { enum Directive2<Instruction, Operand: ast::Operand> {
Variable(ast::LinkingDirective, ast::Variable<SpirvWord>), Variable(ast::LinkingDirective, ast::Variable<SpirvWord>),
Method(Function2<'input, Instruction, Operand>), Method(Function2<Instruction, Operand>),
} }
struct Function2<'input, Instruction, Operand: ast::Operand> { struct Function2<Instruction, Operand: ast::Operand> {
pub func_decl: ast::MethodDeclaration<'input, SpirvWord>, pub return_arguments: Vec<ast::Variable<Operand::Ident>>,
pub globals: Vec<ast::Variable<SpirvWord>>, pub name: Operand::Ident,
pub input_arguments: Vec<ast::Variable<Operand::Ident>>,
pub body: Option<Vec<Statement<Instruction, Operand>>>, pub body: Option<Vec<Statement<Instruction, Operand>>>,
is_kernel: bool,
import_as: Option<String>, import_as: Option<String>,
tuning: Vec<ast::TuningDirective>, tuning: Vec<ast::TuningDirective>,
linkage: ast::LinkingDirective, linkage: ast::LinkingDirective,
flush_to_zero_f32: bool,
flush_to_zero_f16f64: bool,
rounding_mode_f32: ast::RoundingMode,
rounding_mode_f16f64: ast::RoundingMode,
} }
type NormalizedDirective2<'input> = Directive2< type NormalizedDirective2 = Directive2<
'input,
( (
Option<ast::PredAt<SpirvWord>>, Option<ast::PredAt<SpirvWord>>,
ast::Instruction<ast::ParsedOperand<SpirvWord>>, ast::Instruction<ast::ParsedOperand<SpirvWord>>,
@ -580,8 +606,7 @@ type NormalizedDirective2<'input> = Directive2<
ast::ParsedOperand<SpirvWord>, ast::ParsedOperand<SpirvWord>,
>; >;
type NormalizedFunction2<'input> = Function2< type NormalizedFunction2 = Function2<
'input,
( (
Option<ast::PredAt<SpirvWord>>, Option<ast::PredAt<SpirvWord>>,
ast::Instruction<ast::ParsedOperand<SpirvWord>>, ast::Instruction<ast::ParsedOperand<SpirvWord>>,
@ -589,17 +614,11 @@ type NormalizedFunction2<'input> = Function2<
ast::ParsedOperand<SpirvWord>, ast::ParsedOperand<SpirvWord>,
>; >;
type UnconditionalDirective<'input> = Directive2< type UnconditionalDirective =
'input, Directive2<ast::Instruction<ast::ParsedOperand<SpirvWord>>, ast::ParsedOperand<SpirvWord>>;
ast::Instruction<ast::ParsedOperand<SpirvWord>>,
ast::ParsedOperand<SpirvWord>,
>;
type UnconditionalFunction<'input> = Function2< type UnconditionalFunction =
'input, Function2<ast::Instruction<ast::ParsedOperand<SpirvWord>>, ast::ParsedOperand<SpirvWord>>;
ast::Instruction<ast::ParsedOperand<SpirvWord>>,
ast::ParsedOperand<SpirvWord>,
>;
struct GlobalStringIdentResolver2<'input> { struct GlobalStringIdentResolver2<'input> {
pub(crate) current_id: SpirvWord, pub(crate) current_id: SpirvWord,
@ -805,47 +824,45 @@ impl SpecialRegistersMap2 {
self.id_to_reg.get(&id).copied() self.id_to_reg.get(&id).copied()
} }
fn generate_declarations<'a, 'input>( fn len() -> usize {
PtxSpecialRegister::iter().len()
}
fn foreach_declaration<'a, 'input>(
resolver: &'a mut GlobalStringIdentResolver2<'input>, resolver: &'a mut GlobalStringIdentResolver2<'input>,
) -> impl ExactSizeIterator< mut fn_: impl FnMut(
Item = (
PtxSpecialRegister, PtxSpecialRegister,
ast::MethodDeclaration<'input, SpirvWord>, (
Vec<ast::Variable<SpirvWord>>,
SpirvWord,
Vec<ast::Variable<SpirvWord>>,
),
), ),
> + 'a { ) {
PtxSpecialRegister::iter().map(|sreg| { for sreg in PtxSpecialRegister::iter() {
let external_fn_name = [ZLUDA_PTX_PREFIX, sreg.get_unprefixed_function_name()].concat(); let external_fn_name = [ZLUDA_PTX_PREFIX, sreg.get_unprefixed_function_name()].concat();
let name = let name = resolver.register_named(Cow::Owned(external_fn_name), None);
ast::MethodName::Func(resolver.register_named(Cow::Owned(external_fn_name), None));
let return_type = sreg.get_function_return_type(); let return_type = sreg.get_function_return_type();
let input_type = sreg.get_function_input_type(); let input_type = sreg.get_function_input_type();
( let return_arguments = vec![ast::Variable {
sreg, align: None,
ast::MethodDeclaration { v_type: return_type.into(),
return_arguments: vec![ast::Variable { state_space: ast::StateSpace::Reg,
align: None, name: resolver.register_unnamed(Some((return_type.into(), ast::StateSpace::Reg))),
v_type: return_type.into(), array_init: Vec::new(),
state_space: ast::StateSpace::Reg, }];
name: resolver let input_arguments = input_type
.register_unnamed(Some((return_type.into(), ast::StateSpace::Reg))), .into_iter()
array_init: Vec::new(), .map(|type_| ast::Variable {
}], align: None,
name: name, v_type: type_.into(),
input_arguments: input_type state_space: ast::StateSpace::Reg,
.into_iter() name: resolver.register_unnamed(Some((type_.into(), ast::StateSpace::Reg))),
.map(|type_| ast::Variable { array_init: Vec::new(),
align: None, })
v_type: type_.into(), .collect::<Vec<_>>();
state_space: ast::StateSpace::Reg, fn_(sreg, (return_arguments, name, input_arguments));
name: resolver }
.register_unnamed(Some((type_.into(), ast::StateSpace::Reg))),
array_init: Vec::new(),
})
.collect::<Vec<_>>(),
shared_mem: None,
},
)
})
} }
} }

View File

@ -0,0 +1,134 @@
use super::*;
// This pass normalizes PTX modules in ways that make the mode computation pass
// and the code emission passes much simpler:
// * Inserts a label at the start of every function
//   This makes the control flow graph simpler in the mode computation pass: we can
//   represent each kernel as a separate node with its own entry/exit mode
// * Inserts a label at the start of every basic block
// * Inserts explicit jumps before labels
// * Non-.entry methods get a single `ret;` exit point - this is because the mode
//   computation logic requires it. The control flow graph constructed by mode
//   computation models function calls as jumps into and then back from another
//   function. If this cfg allowed multiple return basic blocks then there would
//   be cases where we want to insert a mode setting instruction along the edge
//   between `ret;` and a bb in the caller. That is only possible if there's a
//   single edge from the function's `ret;` to the caller
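//
// An illustrative before/after sketch of this normalization (PTX-style pseudocode
// with made-up identifiers, not taken from any real module). A body like:
//
//     add.f32 a, b, c;
//     @p bra DONE;
//     mul.f32 a, a, a;
//     DONE:
//     ret;
//
// comes out roughly as:
//
//     ENTRY_0:              // fresh label inserted at the function start
//     add.f32 a, b, c;
//     @p bra DONE;          // already a Conditional statement at this point
//     BLOCK_1:              // fresh label inserted after a terminator
//     mul.f32 a, a, a;
//     bra DONE;             // explicit jump inserted before the label
//     DONE:
//     ret;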
pub(crate) fn run(
flat_resolver: &mut GlobalStringIdentResolver2<'_>,
mut directives: Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>,
) -> Result<Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
for directive in directives.iter_mut() {
let (body_ref, is_kernel) = match directive {
Directive2::Method(Function2 {
body: Some(body), is_kernel, ..
}) => (body, *is_kernel),
_ => continue,
};
let body = std::mem::replace(body_ref, Vec::new());
let mut result = Vec::with_capacity(body.len());
let mut previous_instruction_was_terminator = TerminatorKind::Not;
let mut body_iterator = body.into_iter();
let mut return_statements = Vec::new();
match body_iterator.next() {
Some(Statement::Label(_)) => {}
Some(statement) => {
result.push(Statement::Label(flat_resolver.register_unnamed(None)));
result.push(statement);
}
None => {}
}
for statement in body_iterator {
match previous_instruction_was_terminator {
TerminatorKind::Not => match statement {
Statement::Label(label) => {
result.push(Statement::Instruction(ast::Instruction::Bra {
arguments: ast::BraArgs { src: label },
}))
}
_ => {}
},
TerminatorKind::Real => {
if !matches!(statement, Statement::Label(..)) {
result.push(Statement::Label(flat_resolver.register_unnamed(None)));
}
}
TerminatorKind::Fake => match statement {
// If there's a label after a call, just reuse it
Statement::Label(label) => {
result.push(Statement::Instruction(ast::Instruction::Bra {
arguments: ast::BraArgs { src: label },
}))
}
_ => {
let label = flat_resolver.register_unnamed(None);
result.push(Statement::Instruction(ast::Instruction::Bra {
arguments: ast::BraArgs { src: label },
}));
result.push(Statement::Label(label));
}
},
}
match statement {
Statement::RetValue(..) => {
return Err(error_unreachable());
}
Statement::Instruction(ast::Instruction::Ret { .. }) => {
if !is_kernel {
return_statements.push(result.len());
}
}
_ => {}
}
previous_instruction_was_terminator = is_block_terminator(&statement);
result.push(statement);
}
convert_from_multiple_returns_to_single_return(
flat_resolver,
&mut result,
return_statements,
)?;
*body_ref = result;
}
Ok(directives)
}
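// How a statement ends a basic block: `Real` covers actual terminators
// (conditionals, `bra` and `ret`), `Fake` covers calls, which are not real
// terminators but are treated as block boundaries so that the
// instruction-modes-to-global-modes pass can model them as jumps into and out
// of the callee, and `Not` is everything else.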
enum TerminatorKind {
Not,
Real,
Fake,
}
fn convert_from_multiple_returns_to_single_return(
flat_resolver: &mut GlobalStringIdentResolver2<'_>,
result: &mut Vec<Statement<ptx_parser::Instruction<SpirvWord>, SpirvWord>>,
return_statements: Vec<usize>,
) -> Result<(), TranslateError> {
Ok(if return_statements.len() > 1 {
let ret_bb = flat_resolver.register_unnamed(None);
result.push(Statement::Label(ret_bb));
result.push(Statement::Instruction(ast::Instruction::Ret {
data: ast::RetData { uniform: false },
}));
for ret_index in return_statements {
let statement = result.get_mut(ret_index).ok_or_else(error_unreachable)?;
*statement = Statement::Instruction(ast::Instruction::Bra {
arguments: ast::BraArgs { src: ret_bb },
});
}
})
}
fn is_block_terminator(
statement: &Statement<ast::Instruction<SpirvWord>, SpirvWord>,
) -> TerminatorKind {
match statement {
Statement::Conditional(..)
| Statement::Instruction(ast::Instruction::Bra { .. })
| Statement::Instruction(ast::Instruction::Ret { .. }) => TerminatorKind::Real,
// Normally call is not a terminator, but we treat it as such because it
// makes the "instruction modes to global modes" pass possible
Statement::Instruction(ast::Instruction::Call { .. }) => TerminatorKind::Fake,
_ => TerminatorKind::Not,
}
}

View File

@ -4,7 +4,7 @@ use ptx_parser as ast;
pub(crate) fn run<'input, 'b>( pub(crate) fn run<'input, 'b>(
resolver: &mut ScopedResolver<'input, 'b>, resolver: &mut ScopedResolver<'input, 'b>,
directives: Vec<ast::Directive<'input, ast::ParsedOperand<&'input str>>>, directives: Vec<ast::Directive<'input, ast::ParsedOperand<&'input str>>>,
) -> Result<Vec<NormalizedDirective2<'input>>, TranslateError> { ) -> Result<Vec<NormalizedDirective2>, TranslateError> {
resolver.start_scope(); resolver.start_scope();
let result = directives let result = directives
.into_iter() .into_iter()
@ -17,7 +17,7 @@ pub(crate) fn run<'input, 'b>(
fn run_directive<'input, 'b>( fn run_directive<'input, 'b>(
resolver: &mut ScopedResolver<'input, 'b>, resolver: &mut ScopedResolver<'input, 'b>,
directive: ast::Directive<'input, ast::ParsedOperand<&'input str>>, directive: ast::Directive<'input, ast::ParsedOperand<&'input str>>,
) -> Result<NormalizedDirective2<'input>, TranslateError> { ) -> Result<NormalizedDirective2, TranslateError> {
Ok(match directive { Ok(match directive {
ast::Directive::Variable(linking, var) => { ast::Directive::Variable(linking, var) => {
NormalizedDirective2::Variable(linking, run_variable(resolver, var)?) NormalizedDirective2::Variable(linking, run_variable(resolver, var)?)
@ -32,15 +32,11 @@ fn run_method<'input, 'b>(
resolver: &mut ScopedResolver<'input, 'b>, resolver: &mut ScopedResolver<'input, 'b>,
linkage: ast::LinkingDirective, linkage: ast::LinkingDirective,
method: ast::Function<'input, &'input str, ast::Statement<ast::ParsedOperand<&'input str>>>, method: ast::Function<'input, &'input str, ast::Statement<ast::ParsedOperand<&'input str>>>,
) -> Result<NormalizedFunction2<'input>, TranslateError> { ) -> Result<NormalizedFunction2, TranslateError> {
let name = match method.func_directive.name { let is_kernel = method.func_directive.name.is_kernel();
ast::MethodName::Kernel(name) => ast::MethodName::Kernel(name), let name = resolver.add_or_get_in_current_scope_untyped(method.func_directive.name.text())?;
ast::MethodName::Func(text) => {
ast::MethodName::Func(resolver.add_or_get_in_current_scope_untyped(text)?)
}
};
resolver.start_scope(); resolver.start_scope();
let func_decl = run_function_decl(resolver, method.func_directive, name)?; let (return_arguments, input_arguments) = run_function_decl(resolver, method.func_directive)?;
let body = method let body = method
.body .body
.map(|statements| { .map(|statements| {
@ -51,20 +47,25 @@ fn run_method<'input, 'b>(
.transpose()?; .transpose()?;
resolver.end_scope(); resolver.end_scope();
Ok(Function2 { Ok(Function2 {
func_decl, return_arguments,
globals: Vec::new(), name,
input_arguments,
body, body,
import_as: None, import_as: None,
tuning: method.tuning,
linkage, linkage,
is_kernel,
tuning: method.tuning,
flush_to_zero_f32: false,
flush_to_zero_f16f64: false,
rounding_mode_f32: ptx_parser::RoundingMode::NearestEven,
rounding_mode_f16f64: ptx_parser::RoundingMode::NearestEven,
}) })
} }
fn run_function_decl<'input, 'b>( fn run_function_decl<'input, 'b>(
resolver: &mut ScopedResolver<'input, 'b>, resolver: &mut ScopedResolver<'input, 'b>,
func_directive: ast::MethodDeclaration<'input, &'input str>, func_directive: ast::MethodDeclaration<'input, &'input str>,
name: ast::MethodName<'input, SpirvWord>, ) -> Result<(Vec<ast::Variable<SpirvWord>>, Vec<ast::Variable<SpirvWord>>), TranslateError> {
) -> Result<ast::MethodDeclaration<'input, SpirvWord>, TranslateError> {
assert!(func_directive.shared_mem.is_none()); assert!(func_directive.shared_mem.is_none());
let return_arguments = func_directive let return_arguments = func_directive
.return_arguments .return_arguments
@ -76,12 +77,7 @@ fn run_function_decl<'input, 'b>(
.into_iter() .into_iter()
.map(|var| run_variable(resolver, var)) .map(|var| run_variable(resolver, var))
.collect::<Result<Vec<_>, _>>()?; .collect::<Result<Vec<_>, _>>()?;
Ok(ast::MethodDeclaration { Ok((return_arguments, input_arguments))
return_arguments,
name,
input_arguments,
shared_mem: None,
})
} }
fn run_variable<'input, 'b>( fn run_variable<'input, 'b>(

View File

@ -3,8 +3,8 @@ use ptx_parser as ast;
pub(crate) fn run<'input>( pub(crate) fn run<'input>(
resolver: &mut GlobalStringIdentResolver2<'input>, resolver: &mut GlobalStringIdentResolver2<'input>,
directives: Vec<NormalizedDirective2<'input>>, directives: Vec<NormalizedDirective2>,
) -> Result<Vec<UnconditionalDirective<'input>>, TranslateError> { ) -> Result<Vec<UnconditionalDirective>, TranslateError> {
directives directives
.into_iter() .into_iter()
.map(|directive| run_directive(resolver, directive)) .map(|directive| run_directive(resolver, directive))
@ -13,8 +13,8 @@ pub(crate) fn run<'input>(
fn run_directive<'input>( fn run_directive<'input>(
resolver: &mut GlobalStringIdentResolver2<'input>, resolver: &mut GlobalStringIdentResolver2<'input>,
directive: NormalizedDirective2<'input>, directive: NormalizedDirective2,
) -> Result<UnconditionalDirective<'input>, TranslateError> { ) -> Result<UnconditionalDirective, TranslateError> {
Ok(match directive { Ok(match directive {
Directive2::Variable(linking, var) => Directive2::Variable(linking, var), Directive2::Variable(linking, var) => Directive2::Variable(linking, var),
Directive2::Method(method) => Directive2::Method(run_method(resolver, method)?), Directive2::Method(method) => Directive2::Method(run_method(resolver, method)?),
@ -23,8 +23,8 @@ fn run_directive<'input>(
fn run_method<'input>( fn run_method<'input>(
resolver: &mut GlobalStringIdentResolver2<'input>, resolver: &mut GlobalStringIdentResolver2<'input>,
method: NormalizedFunction2<'input>, method: NormalizedFunction2,
) -> Result<UnconditionalFunction<'input>, TranslateError> { ) -> Result<UnconditionalFunction, TranslateError> {
let body = method let body = method
.body .body
.map(|statements| { .map(|statements| {
@ -36,12 +36,18 @@ fn run_method<'input>(
}) })
.transpose()?; .transpose()?;
Ok(Function2 { Ok(Function2 {
func_decl: method.func_decl,
globals: method.globals,
body, body,
return_arguments: method.return_arguments,
name: method.name,
input_arguments: method.input_arguments,
import_as: method.import_as, import_as: method.import_as,
tuning: method.tuning, tuning: method.tuning,
linkage: method.linkage, linkage: method.linkage,
is_kernel: method.is_kernel,
flush_to_zero_f32: method.flush_to_zero_f32,
flush_to_zero_f16f64: method.flush_to_zero_f16f64,
rounding_mode_f32: method.rounding_mode_f32,
rounding_mode_f16f64: method.rounding_mode_f16f64,
}) })
} }

View File

@ -0,0 +1,122 @@
use super::*;
use petgraph::{
graph::NodeIndex,
visit::{Bfs, VisitMap},
Graph,
};
use rustc_hash::FxHashSet;
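// This pass removes dead code at two levels. Within each method body it builds a
// control flow graph over the labels introduced by normalize_basic_blocks, walks it
// breadth-first from the entry label, and filters out every statement belonging to
// a basic block that the walk never reaches. Across the module it then drops
// non-kernel functions that are never the target of any `call` instruction.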
pub(crate) fn run(
mut directives: Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>,
) -> Result<Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
let mut reachable_funcs = FxHashSet::default();
for directive in directives.iter_mut() {
match directive {
Directive2::Method(Function2 {
body: Some(body), ..
}) => {
let old_body = std::mem::replace(body, Vec::new());
let mut cfg = ControlFlowGraph::new();
let mut old_body_iter = old_body.iter();
let mut current_bb = match old_body_iter.next() {
Some(Statement::Label(label)) => cfg.add_or_get_node(*label),
_ => return Err(error_unreachable()),
};
let first_bb = current_bb;
for statement in old_body_iter {
match statement {
Statement::Label(label) => {
current_bb = cfg.add_or_get_node(*label);
}
Statement::Conditional(branch) => {
cfg.add_branch(current_bb, branch.if_true);
cfg.add_branch(current_bb, branch.if_false);
}
Statement::Instruction(ast::Instruction::Bra {
arguments: ast::BraArgs { src },
}) => {
cfg.add_branch(current_bb, *src);
}
Statement::FunctionPointer(FunctionPointerDetails {
src: _func, ..
}) => {
return Err(error_todo());
}
Statement::Instruction(ast::Instruction::Call {
arguments: ast::CallArgs { func, .. },
..
}) => {
reachable_funcs.insert(*func);
}
_ => {}
}
}
let mut bfs = Bfs::new(&cfg.graph, first_bb);
while let Some(_) = bfs.next(&cfg.graph) {}
let mut visited = true;
*body = try_filter_to_vec(old_body.into_iter(), |statement| {
match statement {
Statement::Label(label) => {
visited = bfs
.discovered
.is_visited(cfg.nodes.get(label).ok_or_else(error_unreachable)?);
}
_ => {}
}
Ok(visited)
})?;
}
_ => {}
}
}
Ok(directives
.into_iter()
.filter(|directive| match directive {
Directive2::Variable(..) => true,
Directive2::Method(Function2 {
name, is_kernel, ..
}) => *is_kernel || reachable_funcs.contains(name),
})
.collect::<Vec<_>>())
}
fn try_filter_to_vec<T, E>(
mut iter: impl ExactSizeIterator<Item = T>,
mut filter: impl FnMut(&T) -> Result<bool, E>,
) -> Result<Vec<T>, E> {
iter.try_fold(Vec::with_capacity(iter.len()), |mut vec, item| {
match filter(&item) {
Ok(true) => vec.push(item),
Ok(false) => {}
Err(err) => return Err(err),
}
Ok(vec)
})
}
struct ControlFlowGraph {
graph: Graph<SpirvWord, ()>,
nodes: FxHashMap<SpirvWord, NodeIndex>,
}
impl ControlFlowGraph {
fn new() -> Self {
Self {
graph: Graph::new(),
nodes: FxHashMap::default(),
}
}
fn add_or_get_node(&mut self, id: SpirvWord) -> NodeIndex {
*self
.nodes
.entry(id)
.or_insert_with(|| self.graph.add_node(id))
}
fn add_branch(&mut self, from: NodeIndex, to: SpirvWord) -> NodeIndex {
let to = self.add_or_get_node(to);
self.graph.add_edge(from, to, ());
to
}
}

View File

@ -2,8 +2,8 @@ use super::*;
pub(super) fn run<'input>( pub(super) fn run<'input>(
resolver: &mut GlobalStringIdentResolver2<'input>, resolver: &mut GlobalStringIdentResolver2<'input>,
directives: Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>, directives: Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>,
) -> Result<Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> { ) -> Result<Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
let mut fn_declarations = FxHashMap::default(); let mut fn_declarations = FxHashMap::default();
let remapped_directives = directives let remapped_directives = directives
.into_iter() .into_iter()
@ -13,17 +13,18 @@ pub(super) fn run<'input>(
.into_iter() .into_iter()
.map(|(_, (return_arguments, name, input_arguments))| { .map(|(_, (return_arguments, name, input_arguments))| {
Directive2::Method(Function2 { Directive2::Method(Function2 {
func_decl: ast::MethodDeclaration { return_arguments,
return_arguments, name: name,
name: ast::MethodName::Func(name), input_arguments,
input_arguments,
shared_mem: None,
},
globals: Vec::new(),
body: None, body: None,
import_as: None, import_as: None,
tuning: Vec::new(), tuning: Vec::new(),
linkage: ast::LinkingDirective::EXTERN, linkage: ast::LinkingDirective::EXTERN,
is_kernel: false,
flush_to_zero_f32: false,
flush_to_zero_f16f64: false,
rounding_mode_f32: ptx_parser::RoundingMode::NearestEven,
rounding_mode_f16f64: ptx_parser::RoundingMode::NearestEven,
}) })
}) })
.collect::<Vec<_>>(); .collect::<Vec<_>>();
@ -41,8 +42,8 @@ fn run_directive<'input>(
Vec<ast::Variable<SpirvWord>>, Vec<ast::Variable<SpirvWord>>,
), ),
>, >,
directive: Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>, directive: Directive2<ast::Instruction<SpirvWord>, SpirvWord>,
) -> Result<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> { ) -> Result<Directive2<ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
Ok(match directive { Ok(match directive {
var @ Directive2::Variable(..) => var, var @ Directive2::Variable(..) => var,
Directive2::Method(mut method) => { Directive2::Method(mut method) => {

View File

@ -1,14 +1,15 @@
use std::borrow::Cow;
use super::{GlobalStringIdentResolver2, NormalizedDirective2, SpirvWord}; use super::{GlobalStringIdentResolver2, NormalizedDirective2, SpirvWord};
pub(crate) fn run<'input>( pub(crate) fn run<'input>(
resolver: &GlobalStringIdentResolver2<'input>, resolver: &mut GlobalStringIdentResolver2<'input>,
mut directives: Vec<NormalizedDirective2<'input>>, mut directives: Vec<NormalizedDirective2>,
) -> Vec<NormalizedDirective2<'input>> { ) -> Vec<NormalizedDirective2> {
for directive in directives.iter_mut() { for directive in directives.iter_mut() {
match directive { match directive {
NormalizedDirective2::Method(func) => { NormalizedDirective2::Method(func) => {
func.import_as = replace_with_ptx_impl(resolver, func.name);
replace_with_ptx_impl(resolver, &func.func_decl.name, func.import_as.take());
} }
_ => {} _ => {}
} }
@ -17,22 +18,16 @@ pub(crate) fn run<'input>(
} }
fn replace_with_ptx_impl<'input>( fn replace_with_ptx_impl<'input>(
resolver: &GlobalStringIdentResolver2<'input>, resolver: &mut GlobalStringIdentResolver2<'input>,
fn_name: &ptx_parser::MethodName<'input, SpirvWord>, fn_name: SpirvWord,
name: Option<String>, ) {
) -> Option<String> {
let known_names = ["__assertfail"]; let known_names = ["__assertfail"];
match name { if let Some(super::IdentEntry {
Some(name) if known_names.contains(&&*name) => Some(format!("__zluda_ptx_impl_{}", name)), name: Some(name), ..
Some(name) => Some(name), }) = resolver.ident_map.get_mut(&fn_name)
None => match fn_name { {
ptx_parser::MethodName::Func(name) => match resolver.ident_map.get(name) { if known_names.contains(&&**name) {
Some(super::IdentEntry { *name = Cow::Owned(format!("__zluda_ptx_impl_{}", name));
name: Some(name), .. }
}) => Some(format!("__zluda_ptx_impl_{}", name)),
_ => None,
},
ptx_parser::MethodName::Kernel(..) => None,
},
} }
} }

View File

@ -3,8 +3,8 @@ use ptx_parser as ast;
use rustc_hash::FxHashSet; use rustc_hash::FxHashSet;
pub(crate) fn run<'input>( pub(crate) fn run<'input>(
directives: Vec<UnconditionalDirective<'input>>, directives: Vec<UnconditionalDirective>,
) -> Result<Vec<UnconditionalDirective<'input>>, TranslateError> { ) -> Result<Vec<UnconditionalDirective>, TranslateError> {
let mut functions = FxHashSet::default(); let mut functions = FxHashSet::default();
directives directives
.into_iter() .into_iter()
@ -14,19 +14,13 @@ pub(crate) fn run<'input>(
fn run_directive<'input>( fn run_directive<'input>(
functions: &mut FxHashSet<SpirvWord>, functions: &mut FxHashSet<SpirvWord>,
directive: UnconditionalDirective<'input>, directive: UnconditionalDirective,
) -> Result<UnconditionalDirective<'input>, TranslateError> { ) -> Result<UnconditionalDirective, TranslateError> {
Ok(match directive { Ok(match directive {
var @ Directive2::Variable(..) => var, var @ Directive2::Variable(..) => var,
Directive2::Method(method) => { Directive2::Method(method) => {
{ if !method.is_kernel {
let func_decl = &method.func_decl; functions.insert(method.name);
match func_decl.name {
ptx_parser::MethodName::Kernel(_) => {}
ptx_parser::MethodName::Func(name) => {
functions.insert(name);
}
}
} }
Directive2::Method(run_method(functions, method)?) Directive2::Method(run_method(functions, method)?)
} }
@ -35,8 +29,8 @@ fn run_directive<'input>(
fn run_method<'input>( fn run_method<'input>(
functions: &mut FxHashSet<SpirvWord>, functions: &mut FxHashSet<SpirvWord>,
method: UnconditionalFunction<'input>, method: UnconditionalFunction,
) -> Result<UnconditionalFunction<'input>, TranslateError> { ) -> Result<UnconditionalFunction, TranslateError> {
let body = method let body = method
.body .body
.map(|statements| { .map(|statements| {
@ -46,14 +40,7 @@ fn run_method<'input>(
.collect::<Result<Vec<_>, _>>() .collect::<Result<Vec<_>, _>>()
}) })
.transpose()?; .transpose()?;
Ok(Function2 { Ok(Function2 { body, ..method })
func_decl: method.func_decl,
globals: method.globals,
body,
import_as: method.import_as,
tuning: method.tuning,
linkage: method.linkage,
})
} }
fn run_statement<'input>( fn run_statement<'input>(

View File

@ -0,0 +1,24 @@
declare i32 @__zluda_ptx_impl_activemask() #0
define amdgpu_kernel void @activemask(ptr addrspace(4) byref(i64) %"29", ptr addrspace(4) byref(i64) %"30") #1 {
%"31" = alloca i64, align 8, addrspace(5)
%"32" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"28"
"28": ; preds = %1
%"33" = load i64, ptr addrspace(4) %"30", align 4
store i64 %"33", ptr addrspace(5) %"31", align 4
%"34" = call i32 @__zluda_ptx_impl_activemask()
store i32 %"34", ptr addrspace(5) %"32", align 4
%"35" = load i64, ptr addrspace(5) %"31", align 4
%"36" = load i32, ptr addrspace(5) %"32", align 4
%"37" = inttoptr i64 %"35" to ptr
store i32 %"36", ptr %"37", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

ptx/src/test/ll/add.ll
View File

@ -0,0 +1,30 @@
define amdgpu_kernel void @add(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) byref(i64) %"33") #0 {
%"34" = alloca i64, align 8, addrspace(5)
%"35" = alloca i64, align 8, addrspace(5)
%"36" = alloca i64, align 8, addrspace(5)
%"37" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
br label %"31"
"31": ; preds = %1
%"38" = load i64, ptr addrspace(4) %"32", align 4
store i64 %"38", ptr addrspace(5) %"34", align 4
%"39" = load i64, ptr addrspace(4) %"33", align 4
store i64 %"39", ptr addrspace(5) %"35", align 4
%"41" = load i64, ptr addrspace(5) %"34", align 4
%"46" = inttoptr i64 %"41" to ptr
%"40" = load i64, ptr %"46", align 4
store i64 %"40", ptr addrspace(5) %"36", align 4
%"43" = load i64, ptr addrspace(5) %"36", align 4
%"42" = add i64 %"43", 1
store i64 %"42", ptr addrspace(5) %"37", align 4
%"44" = load i64, ptr addrspace(5) %"35", align 4
%"45" = load i64, ptr addrspace(5) %"37", align 4
%"47" = inttoptr i64 %"44" to ptr
store i64 %"45", ptr %"47", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

View File

@ -0,0 +1,52 @@
define amdgpu_kernel void @add_ftz(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 {
%"39" = alloca i64, align 8, addrspace(5)
%"40" = alloca i64, align 8, addrspace(5)
%"41" = alloca float, align 4, addrspace(5)
%"42" = alloca float, align 4, addrspace(5)
%"43" = alloca float, align 4, addrspace(5)
%"44" = alloca float, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"36"
"36": ; preds = %1
%"45" = load i64, ptr addrspace(4) %"37", align 4
store i64 %"45", ptr addrspace(5) %"39", align 4
%"46" = load i64, ptr addrspace(4) %"38", align 4
store i64 %"46", ptr addrspace(5) %"40", align 4
%"48" = load i64, ptr addrspace(5) %"39", align 4
%"61" = inttoptr i64 %"48" to ptr
%"47" = load float, ptr %"61", align 4
store float %"47", ptr addrspace(5) %"41", align 4
%"49" = load i64, ptr addrspace(5) %"39", align 4
%"62" = inttoptr i64 %"49" to ptr
%"33" = getelementptr inbounds i8, ptr %"62", i64 4
%"50" = load float, ptr %"33", align 4
store float %"50", ptr addrspace(5) %"42", align 4
%"52" = load float, ptr addrspace(5) %"41", align 4
%"53" = load float, ptr addrspace(5) %"42", align 4
%"51" = fadd float %"52", %"53"
store float %"51", ptr addrspace(5) %"43", align 4
call void @llvm.amdgcn.s.setreg(i32 6401, i32 3)
%"55" = load float, ptr addrspace(5) %"41", align 4
%"56" = load float, ptr addrspace(5) %"42", align 4
%"54" = fadd float %"55", %"56"
store float %"54", ptr addrspace(5) %"44", align 4
%"57" = load i64, ptr addrspace(5) %"40", align 4
%"58" = load float, ptr addrspace(5) %"43", align 4
%"63" = inttoptr i64 %"57" to ptr
store float %"58", ptr %"63", align 4
%"59" = load i64, ptr addrspace(5) %"40", align 4
%"64" = inttoptr i64 %"59" to ptr
%"35" = getelementptr inbounds i8, ptr %"64", i64 4
%"60" = load float, ptr addrspace(5) %"44", align 4
store float %"60", ptr %"35", align 4
ret void
}
; Function Attrs: nocallback nofree nosync nounwind willreturn
declare void @llvm.amdgcn.s.setreg(i32 immarg, i32) #1
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind willreturn }

View File

@ -0,0 +1,30 @@
define amdgpu_kernel void @add_non_coherent(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) byref(i64) %"33") #0 {
%"34" = alloca i64, align 8, addrspace(5)
%"35" = alloca i64, align 8, addrspace(5)
%"36" = alloca i64, align 8, addrspace(5)
%"37" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
br label %"31"
"31": ; preds = %1
%"38" = load i64, ptr addrspace(4) %"32", align 4
store i64 %"38", ptr addrspace(5) %"34", align 4
%"39" = load i64, ptr addrspace(4) %"33", align 4
store i64 %"39", ptr addrspace(5) %"35", align 4
%"41" = load i64, ptr addrspace(5) %"34", align 4
%"46" = inttoptr i64 %"41" to ptr addrspace(1)
%"40" = load i64, ptr addrspace(1) %"46", align 4
store i64 %"40", ptr addrspace(5) %"36", align 4
%"43" = load i64, ptr addrspace(5) %"36", align 4
%"42" = add i64 %"43", 1
store i64 %"42", ptr addrspace(5) %"37", align 4
%"44" = load i64, ptr addrspace(5) %"35", align 4
%"45" = load i64, ptr addrspace(5) %"37", align 4
%"47" = inttoptr i64 %"44" to ptr addrspace(1)
store i64 %"45", ptr addrspace(1) %"47", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

View File

@ -0,0 +1,30 @@
define amdgpu_kernel void @add_tuning(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) byref(i64) %"33") #0 {
%"34" = alloca i64, align 8, addrspace(5)
%"35" = alloca i64, align 8, addrspace(5)
%"36" = alloca i64, align 8, addrspace(5)
%"37" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
br label %"31"
"31": ; preds = %1
%"38" = load i64, ptr addrspace(4) %"32", align 4
store i64 %"38", ptr addrspace(5) %"34", align 4
%"39" = load i64, ptr addrspace(4) %"33", align 4
store i64 %"39", ptr addrspace(5) %"35", align 4
%"41" = load i64, ptr addrspace(5) %"34", align 4
%"46" = inttoptr i64 %"41" to ptr
%"40" = load i64, ptr %"46", align 4
store i64 %"40", ptr addrspace(5) %"36", align 4
%"43" = load i64, ptr addrspace(5) %"36", align 4
%"42" = add i64 %"43", 1
store i64 %"42", ptr addrspace(5) %"37", align 4
%"44" = load i64, ptr addrspace(5) %"35", align 4
%"45" = load i64, ptr addrspace(5) %"37", align 4
%"47" = inttoptr i64 %"44" to ptr
store i64 %"45", ptr %"47", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

ptx/src/test/ll/and.ll
View File

@ -0,0 +1,36 @@
define amdgpu_kernel void @and(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 {
%"35" = alloca i64, align 8, addrspace(5)
%"36" = alloca i64, align 8, addrspace(5)
%"37" = alloca i32, align 4, addrspace(5)
%"38" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"32"
"32": ; preds = %1
%"39" = load i64, ptr addrspace(4) %"33", align 4
store i64 %"39", ptr addrspace(5) %"35", align 4
%"40" = load i64, ptr addrspace(4) %"34", align 4
store i64 %"40", ptr addrspace(5) %"36", align 4
%"42" = load i64, ptr addrspace(5) %"35", align 4
%"50" = inttoptr i64 %"42" to ptr
%"41" = load i32, ptr %"50", align 4
store i32 %"41", ptr addrspace(5) %"37", align 4
%"43" = load i64, ptr addrspace(5) %"35", align 4
%"51" = inttoptr i64 %"43" to ptr
%"31" = getelementptr inbounds i8, ptr %"51", i64 4
%"44" = load i32, ptr %"31", align 4
store i32 %"44", ptr addrspace(5) %"38", align 4
%"46" = load i32, ptr addrspace(5) %"37", align 4
%"47" = load i32, ptr addrspace(5) %"38", align 4
%"52" = and i32 %"46", %"47"
store i32 %"52", ptr addrspace(5) %"37", align 4
%"48" = load i64, ptr addrspace(5) %"36", align 4
%"49" = load i32, ptr addrspace(5) %"37", align 4
%"55" = inttoptr i64 %"48" to ptr
store i32 %"49", ptr %"55", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

View File

@ -0,0 +1,46 @@
@shared_mem = external addrspace(3) global [1024 x i8], align 4
define amdgpu_kernel void @atom_add(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 {
%"38" = alloca i64, align 8, addrspace(5)
%"39" = alloca i64, align 8, addrspace(5)
%"40" = alloca i32, align 4, addrspace(5)
%"41" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"35"
"35": ; preds = %1
%"42" = load i64, ptr addrspace(4) %"36", align 4
store i64 %"42", ptr addrspace(5) %"38", align 4
%"43" = load i64, ptr addrspace(4) %"37", align 4
store i64 %"43", ptr addrspace(5) %"39", align 4
%"45" = load i64, ptr addrspace(5) %"38", align 4
%"56" = inttoptr i64 %"45" to ptr
%"44" = load i32, ptr %"56", align 4
store i32 %"44", ptr addrspace(5) %"40", align 4
%"46" = load i64, ptr addrspace(5) %"38", align 4
%"57" = inttoptr i64 %"46" to ptr
%"32" = getelementptr inbounds i8, ptr %"57", i64 4
%"47" = load i32, ptr %"32", align 4
store i32 %"47", ptr addrspace(5) %"41", align 4
%"48" = load i32, ptr addrspace(5) %"40", align 4
store i32 %"48", ptr addrspace(3) @shared_mem, align 4
%"50" = load i32, ptr addrspace(5) %"41", align 4
%2 = atomicrmw add ptr addrspace(3) @shared_mem, i32 %"50" syncscope("agent-one-as") monotonic, align 4
store i32 %2, ptr addrspace(5) %"40", align 4
%"51" = load i32, ptr addrspace(3) @shared_mem, align 4
store i32 %"51", ptr addrspace(5) %"41", align 4
%"52" = load i64, ptr addrspace(5) %"39", align 4
%"53" = load i32, ptr addrspace(5) %"40", align 4
%"61" = inttoptr i64 %"52" to ptr
store i32 %"53", ptr %"61", align 4
%"54" = load i64, ptr addrspace(5) %"39", align 4
%"62" = inttoptr i64 %"54" to ptr
%"34" = getelementptr inbounds i8, ptr %"62", i64 4
%"55" = load i32, ptr addrspace(5) %"41", align 4
store i32 %"55", ptr %"34", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

View File

@ -0,0 +1,46 @@
@shared_mem = external addrspace(3) global [1024 x i8], align 4
define amdgpu_kernel void @atom_add_float(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 {
%"38" = alloca i64, align 8, addrspace(5)
%"39" = alloca i64, align 8, addrspace(5)
%"40" = alloca float, align 4, addrspace(5)
%"41" = alloca float, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"35"
"35": ; preds = %1
%"42" = load i64, ptr addrspace(4) %"36", align 4
store i64 %"42", ptr addrspace(5) %"38", align 4
%"43" = load i64, ptr addrspace(4) %"37", align 4
store i64 %"43", ptr addrspace(5) %"39", align 4
%"45" = load i64, ptr addrspace(5) %"38", align 4
%"56" = inttoptr i64 %"45" to ptr
%"44" = load float, ptr %"56", align 4
store float %"44", ptr addrspace(5) %"40", align 4
%"46" = load i64, ptr addrspace(5) %"38", align 4
%"57" = inttoptr i64 %"46" to ptr
%"32" = getelementptr inbounds i8, ptr %"57", i64 4
%"47" = load float, ptr %"32", align 4
store float %"47", ptr addrspace(5) %"41", align 4
%"48" = load float, ptr addrspace(5) %"40", align 4
store float %"48", ptr addrspace(3) @shared_mem, align 4
%"50" = load float, ptr addrspace(5) %"41", align 4
%2 = atomicrmw fadd ptr addrspace(3) @shared_mem, float %"50" syncscope("agent-one-as") monotonic, align 4
store float %2, ptr addrspace(5) %"40", align 4
%"51" = load float, ptr addrspace(3) @shared_mem, align 4
store float %"51", ptr addrspace(5) %"41", align 4
%"52" = load i64, ptr addrspace(5) %"39", align 4
%"53" = load float, ptr addrspace(5) %"40", align 4
%"61" = inttoptr i64 %"52" to ptr
store float %"53", ptr %"61", align 4
%"54" = load i64, ptr addrspace(5) %"39", align 4
%"62" = inttoptr i64 %"54" to ptr
%"34" = getelementptr inbounds i8, ptr %"62", i64 4
%"55" = load float, ptr addrspace(5) %"41", align 4
store float %"55", ptr %"34", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

View File

@ -0,0 +1,44 @@
define amdgpu_kernel void @atom_cas(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #0 {
%"40" = alloca i64, align 8, addrspace(5)
%"41" = alloca i64, align 8, addrspace(5)
%"42" = alloca i32, align 4, addrspace(5)
%"43" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"37"
"37": ; preds = %1
%"44" = load i64, ptr addrspace(4) %"38", align 4
store i64 %"44", ptr addrspace(5) %"40", align 4
%"45" = load i64, ptr addrspace(4) %"39", align 4
store i64 %"45", ptr addrspace(5) %"41", align 4
%"47" = load i64, ptr addrspace(5) %"40", align 4
%"57" = inttoptr i64 %"47" to ptr
%"46" = load i32, ptr %"57", align 4
store i32 %"46", ptr addrspace(5) %"42", align 4
%"48" = load i64, ptr addrspace(5) %"40", align 4
%"58" = inttoptr i64 %"48" to ptr
%"31" = getelementptr inbounds i8, ptr %"58", i64 4
%"50" = load i32, ptr addrspace(5) %"42", align 4
%2 = cmpxchg ptr %"31", i32 %"50", i32 100 syncscope("agent-one-as") monotonic monotonic, align 4
%"59" = extractvalue { i32, i1 } %2, 0
store i32 %"59", ptr addrspace(5) %"42", align 4
%"51" = load i64, ptr addrspace(5) %"40", align 4
%"61" = inttoptr i64 %"51" to ptr
%"34" = getelementptr inbounds i8, ptr %"61", i64 4
%"52" = load i32, ptr %"34", align 4
store i32 %"52", ptr addrspace(5) %"43", align 4
%"53" = load i64, ptr addrspace(5) %"41", align 4
%"54" = load i32, ptr addrspace(5) %"42", align 4
%"62" = inttoptr i64 %"53" to ptr
store i32 %"54", ptr %"62", align 4
%"55" = load i64, ptr addrspace(5) %"41", align 4
%"63" = inttoptr i64 %"55" to ptr
%"36" = getelementptr inbounds i8, ptr %"63", i64 4
%"56" = load i32, ptr addrspace(5) %"43", align 4
store i32 %"56", ptr %"36", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

View File

@ -0,0 +1,46 @@
define amdgpu_kernel void @atom_inc(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #0 {
%"40" = alloca i64, align 8, addrspace(5)
%"41" = alloca i64, align 8, addrspace(5)
%"42" = alloca i32, align 4, addrspace(5)
%"43" = alloca i32, align 4, addrspace(5)
%"44" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"37"
"37": ; preds = %1
%"45" = load i64, ptr addrspace(4) %"38", align 4
store i64 %"45", ptr addrspace(5) %"40", align 4
%"46" = load i64, ptr addrspace(4) %"39", align 4
store i64 %"46", ptr addrspace(5) %"41", align 4
%"48" = load i64, ptr addrspace(5) %"40", align 4
%"59" = inttoptr i64 %"48" to ptr
%2 = atomicrmw uinc_wrap ptr %"59", i32 101 syncscope("agent-one-as") monotonic, align 4
store i32 %2, ptr addrspace(5) %"42", align 4
%"50" = load i64, ptr addrspace(5) %"40", align 4
%"60" = inttoptr i64 %"50" to ptr addrspace(1)
%3 = atomicrmw uinc_wrap ptr addrspace(1) %"60", i32 101 syncscope("agent-one-as") monotonic, align 4
store i32 %3, ptr addrspace(5) %"43", align 4
%"52" = load i64, ptr addrspace(5) %"40", align 4
%"61" = inttoptr i64 %"52" to ptr
%"51" = load i32, ptr %"61", align 4
store i32 %"51", ptr addrspace(5) %"44", align 4
%"53" = load i64, ptr addrspace(5) %"41", align 4
%"54" = load i32, ptr addrspace(5) %"42", align 4
%"62" = inttoptr i64 %"53" to ptr
store i32 %"54", ptr %"62", align 4
%"55" = load i64, ptr addrspace(5) %"41", align 4
%"63" = inttoptr i64 %"55" to ptr
%"34" = getelementptr inbounds i8, ptr %"63", i64 4
%"56" = load i32, ptr addrspace(5) %"43", align 4
store i32 %"56", ptr %"34", align 4
%"57" = load i64, ptr addrspace(5) %"41", align 4
%"64" = inttoptr i64 %"57" to ptr
%"36" = getelementptr inbounds i8, ptr %"64", i64 8
%"58" = load i32, ptr addrspace(5) %"44", align 4
store i32 %"58", ptr %"36", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }


@ -0,0 +1,30 @@
define amdgpu_kernel void @b64tof64(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
%"33" = alloca double, align 8, addrspace(5)
%"34" = alloca i64, align 8, addrspace(5)
%"35" = alloca i64, align 8, addrspace(5)
%"36" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
br label %"30"
"30": ; preds = %1
%"37" = load double, ptr addrspace(4) %"31", align 8
store double %"37", ptr addrspace(5) %"33", align 8
%"38" = load i64, ptr addrspace(4) %"32", align 4
store i64 %"38", ptr addrspace(5) %"35", align 4
%"40" = load double, ptr addrspace(5) %"33", align 8
%"46" = bitcast double %"40" to i64
store i64 %"46", ptr addrspace(5) %"34", align 4
%"42" = load i64, ptr addrspace(5) %"34", align 4
%"47" = inttoptr i64 %"42" to ptr
%"41" = load i64, ptr %"47", align 4
store i64 %"41", ptr addrspace(5) %"36", align 4
%"43" = load i64, ptr addrspace(5) %"35", align 4
%"44" = load i64, ptr addrspace(5) %"36", align 4
%"48" = inttoptr i64 %"43" to ptr
store i64 %"44", ptr %"48", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

91 ptx/src/test/ll/bench.ll Normal file

@ -0,0 +1,91 @@
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
declare i32 @__zluda_ptx_impl_sreg_clock() #0
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
define amdgpu_kernel void @bench(ptr addrspace(4) byref(i64) %"55", ptr addrspace(4) byref(i64) %"56") #1 {
%"57" = alloca i64, align 8, addrspace(5)
%"58" = alloca i64, align 8, addrspace(5)
%"59" = alloca float, align 4, addrspace(5)
%"60" = alloca float, align 4, addrspace(5)
%"61" = alloca float, align 4, addrspace(5)
%"62" = alloca float, align 4, addrspace(5)
%"63" = alloca i32, align 4, addrspace(5)
%"64" = alloca i1, align 1, addrspace(5)
br label %1
1: ; preds = %0
br label %"97"
"97": ; preds = %1
%"65" = load i64, ptr addrspace(4) %"55", align 4
store i64 %"65", ptr addrspace(5) %"57", align 4
%"66" = load i64, ptr addrspace(4) %"56", align 4
store i64 %"66", ptr addrspace(5) %"58", align 4
%"68" = load i64, ptr addrspace(5) %"57", align 4
%"91" = inttoptr i64 %"68" to ptr
%"67" = load float, ptr %"91", align 4
store float %"67", ptr addrspace(5) %"59", align 4
%"69" = load i64, ptr addrspace(5) %"57", align 4
%"92" = inttoptr i64 %"69" to ptr
%"39" = getelementptr inbounds i8, ptr %"92", i64 4
%"70" = load float, ptr %"39", align 4
store float %"70", ptr addrspace(5) %"60", align 4
%"71" = load i64, ptr addrspace(5) %"57", align 4
%"93" = inttoptr i64 %"71" to ptr
%"41" = getelementptr inbounds i8, ptr %"93", i64 8
%"72" = load float, ptr %"41", align 4
store float %"72", ptr addrspace(5) %"61", align 4
%"73" = load i64, ptr addrspace(5) %"57", align 4
%"94" = inttoptr i64 %"73" to ptr
%"43" = getelementptr inbounds i8, ptr %"94", i64 12
%"74" = load float, ptr %"43", align 4
store float %"74", ptr addrspace(5) %"62", align 4
store i32 0, ptr addrspace(5) %"63", align 4
br label %"10"
"10": ; preds = %"21", %"97"
%"77" = load float, ptr addrspace(5) %"59", align 4
%"78" = load float, ptr addrspace(5) %"60", align 4
call void asm sideeffect "s_denorm_mode 0", "~{mode}"()
%"76" = fmul float %"77", %"78"
store float %"76", ptr addrspace(5) %"59", align 4
%"80" = load float, ptr addrspace(5) %"61", align 4
%"81" = load float, ptr addrspace(5) %"62", align 4
call void asm sideeffect "s_denorm_mode 11", "~{mode}"()
%"79" = fmul float %"80", %"81"
store float %"79", ptr addrspace(5) %"61", align 4
%"83" = load i32, ptr addrspace(5) %"63", align 4
%"82" = add i32 %"83", 1
store i32 %"82", ptr addrspace(5) %"63", align 4
%"85" = load i32, ptr addrspace(5) %"63", align 4
%"84" = icmp eq i32 %"85", 100000000
store i1 %"84", ptr addrspace(5) %"64", align 1
%"86" = load i1, ptr addrspace(5) %"64", align 1
br i1 %"86", label %"11", label %"21"
"21": ; preds = %"10"
br label %"10"
"11": ; preds = %"10"
%"87" = load i64, ptr addrspace(5) %"58", align 4
%"88" = load float, ptr addrspace(5) %"59", align 4
%"95" = inttoptr i64 %"87" to ptr
store float %"88", ptr %"95", align 4
%"89" = load i64, ptr addrspace(5) %"58", align 4
%"96" = inttoptr i64 %"89" to ptr
%"48" = getelementptr inbounds i8, ptr %"96", i64 4
%"90" = load float, ptr addrspace(5) %"61", align 4
store float %"90", ptr %"48", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

46 ptx/src/test/ll/bfe.ll Normal file

@ -0,0 +1,46 @@
declare i32 @__zluda_ptx_impl_bfe_u32(i32, i32, i32) #0
define amdgpu_kernel void @bfe(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #1 {
%"38" = alloca i64, align 8, addrspace(5)
%"39" = alloca i64, align 8, addrspace(5)
%"40" = alloca i32, align 4, addrspace(5)
%"41" = alloca i32, align 4, addrspace(5)
%"42" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"35"
"35": ; preds = %1
%"43" = load i64, ptr addrspace(4) %"36", align 4
store i64 %"43", ptr addrspace(5) %"38", align 4
%"44" = load i64, ptr addrspace(4) %"37", align 4
store i64 %"44", ptr addrspace(5) %"39", align 4
%"46" = load i64, ptr addrspace(5) %"38", align 4
%"57" = inttoptr i64 %"46" to ptr
%"45" = load i32, ptr %"57", align 4
store i32 %"45", ptr addrspace(5) %"40", align 4
%"47" = load i64, ptr addrspace(5) %"38", align 4
%"58" = inttoptr i64 %"47" to ptr
%"32" = getelementptr inbounds i8, ptr %"58", i64 4
%"48" = load i32, ptr %"32", align 4
store i32 %"48", ptr addrspace(5) %"41", align 4
%"49" = load i64, ptr addrspace(5) %"38", align 4
%"59" = inttoptr i64 %"49" to ptr
%"34" = getelementptr inbounds i8, ptr %"59", i64 8
%"50" = load i32, ptr %"34", align 4
store i32 %"50", ptr addrspace(5) %"42", align 4
%"52" = load i32, ptr addrspace(5) %"40", align 4
%"53" = load i32, ptr addrspace(5) %"41", align 4
%"54" = load i32, ptr addrspace(5) %"42", align 4
%"51" = call i32 @__zluda_ptx_impl_bfe_u32(i32 %"52", i32 %"53", i32 %"54")
store i32 %"51", ptr addrspace(5) %"40", align 4
%"55" = load i64, ptr addrspace(5) %"39", align 4
%"56" = load i32, ptr addrspace(5) %"40", align 4
%"60" = inttoptr i64 %"55" to ptr
store i32 %"56", ptr %"60", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

53 ptx/src/test/ll/bfi.ll Normal file

@ -0,0 +1,53 @@
declare i32 @__zluda_ptx_impl_bfi_b32(i32, i32, i32, i32) #0
define amdgpu_kernel void @bfi(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #1 {
%"41" = alloca i64, align 8, addrspace(5)
%"42" = alloca i64, align 8, addrspace(5)
%"43" = alloca i32, align 4, addrspace(5)
%"44" = alloca i32, align 4, addrspace(5)
%"45" = alloca i32, align 4, addrspace(5)
%"46" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"38"
"38": ; preds = %1
%"47" = load i64, ptr addrspace(4) %"39", align 4
store i64 %"47", ptr addrspace(5) %"41", align 4
%"48" = load i64, ptr addrspace(4) %"40", align 4
store i64 %"48", ptr addrspace(5) %"42", align 4
%"50" = load i64, ptr addrspace(5) %"41", align 4
%"64" = inttoptr i64 %"50" to ptr
%"49" = load i32, ptr %"64", align 4
store i32 %"49", ptr addrspace(5) %"43", align 4
%"51" = load i64, ptr addrspace(5) %"41", align 4
%"65" = inttoptr i64 %"51" to ptr
%"33" = getelementptr inbounds i8, ptr %"65", i64 4
%"52" = load i32, ptr %"33", align 4
store i32 %"52", ptr addrspace(5) %"44", align 4
%"53" = load i64, ptr addrspace(5) %"41", align 4
%"66" = inttoptr i64 %"53" to ptr
%"35" = getelementptr inbounds i8, ptr %"66", i64 8
%"54" = load i32, ptr %"35", align 4
store i32 %"54", ptr addrspace(5) %"45", align 4
%"55" = load i64, ptr addrspace(5) %"41", align 4
%"67" = inttoptr i64 %"55" to ptr
%"37" = getelementptr inbounds i8, ptr %"67", i64 12
%"56" = load i32, ptr %"37", align 4
store i32 %"56", ptr addrspace(5) %"46", align 4
%"58" = load i32, ptr addrspace(5) %"43", align 4
%"59" = load i32, ptr addrspace(5) %"44", align 4
%"60" = load i32, ptr addrspace(5) %"45", align 4
%"61" = load i32, ptr addrspace(5) %"46", align 4
%"68" = call i32 @__zluda_ptx_impl_bfi_b32(i32 %"58", i32 %"59", i32 %"60", i32 %"61")
store i32 %"68", ptr addrspace(5) %"43", align 4
%"62" = load i64, ptr addrspace(5) %"42", align 4
%"63" = load i32, ptr addrspace(5) %"43", align 4
%"71" = inttoptr i64 %"62" to ptr
store i32 %"63", ptr %"71", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

34 ptx/src/test/ll/block.ll Normal file

@ -0,0 +1,34 @@
define amdgpu_kernel void @block(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 {
%"36" = alloca i64, align 8, addrspace(5)
%"37" = alloca i64, align 8, addrspace(5)
%"38" = alloca i64, align 8, addrspace(5)
%"39" = alloca i64, align 8, addrspace(5)
%"46" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
br label %"33"
"33": ; preds = %1
%"40" = load i64, ptr addrspace(4) %"34", align 4
store i64 %"40", ptr addrspace(5) %"36", align 4
%"41" = load i64, ptr addrspace(4) %"35", align 4
store i64 %"41", ptr addrspace(5) %"37", align 4
%"43" = load i64, ptr addrspace(5) %"36", align 4
%"51" = inttoptr i64 %"43" to ptr
%"42" = load i64, ptr %"51", align 4
store i64 %"42", ptr addrspace(5) %"38", align 4
%"45" = load i64, ptr addrspace(5) %"38", align 4
%"44" = add i64 %"45", 1
store i64 %"44", ptr addrspace(5) %"39", align 4
%"48" = load i64, ptr addrspace(5) %"46", align 4
%"47" = add i64 %"48", 1
store i64 %"47", ptr addrspace(5) %"46", align 4
%"49" = load i64, ptr addrspace(5) %"37", align 4
%"50" = load i64, ptr addrspace(5) %"39", align 4
%"52" = inttoptr i64 %"49" to ptr
store i64 %"50", ptr %"52", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

36 ptx/src/test/ll/bra.ll Normal file

@ -0,0 +1,36 @@
define amdgpu_kernel void @bra(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 {
%"38" = alloca i64, align 8, addrspace(5)
%"39" = alloca i64, align 8, addrspace(5)
%"40" = alloca i64, align 8, addrspace(5)
%"41" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
br label %"35"
"35": ; preds = %1
%"42" = load i64, ptr addrspace(4) %"36", align 4
store i64 %"42", ptr addrspace(5) %"38", align 4
%"43" = load i64, ptr addrspace(4) %"37", align 4
store i64 %"43", ptr addrspace(5) %"39", align 4
%"45" = load i64, ptr addrspace(5) %"38", align 4
%"50" = inttoptr i64 %"45" to ptr
%"44" = load i64, ptr %"50", align 4
store i64 %"44", ptr addrspace(5) %"40", align 4
br label %"10"
"10": ; preds = %"35"
%"47" = load i64, ptr addrspace(5) %"40", align 4
%"46" = add i64 %"47", 1
store i64 %"46", ptr addrspace(5) %"41", align 4
br label %"12"
"12": ; preds = %"10"
%"48" = load i64, ptr addrspace(5) %"39", align 4
%"49" = load i64, ptr addrspace(5) %"41", align 4
%"51" = inttoptr i64 %"48" to ptr
store i64 %"49", ptr %"51", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

33 ptx/src/test/ll/brev.ll Normal file

@ -0,0 +1,33 @@
define amdgpu_kernel void @brev(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
%"32" = alloca i64, align 8, addrspace(5)
%"33" = alloca i64, align 8, addrspace(5)
%"34" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"29"
"29": ; preds = %1
%"35" = load i64, ptr addrspace(4) %"30", align 4
store i64 %"35", ptr addrspace(5) %"32", align 4
%"36" = load i64, ptr addrspace(4) %"31", align 4
store i64 %"36", ptr addrspace(5) %"33", align 4
%"38" = load i64, ptr addrspace(5) %"32", align 4
%"43" = inttoptr i64 %"38" to ptr
%"37" = load i32, ptr %"43", align 4
store i32 %"37", ptr addrspace(5) %"34", align 4
%"40" = load i32, ptr addrspace(5) %"34", align 4
%"39" = call i32 @llvm.bitreverse.i32(i32 %"40")
store i32 %"39", ptr addrspace(5) %"34", align 4
%"41" = load i64, ptr addrspace(5) %"33", align 4
%"42" = load i32, ptr addrspace(5) %"34", align 4
%"44" = inttoptr i64 %"41" to ptr
store i32 %"42", ptr %"44", align 4
ret void
}
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.bitreverse.i32(i32) #1
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }

64 ptx/src/test/ll/call.ll Normal file

@ -0,0 +1,64 @@
define i64 @incr(i64 %"43") #0 {
%"63" = alloca i64, align 8, addrspace(5)
%"64" = alloca i64, align 8, addrspace(5)
%"65" = alloca i64, align 8, addrspace(5)
%"66" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
br label %"46"
"46": ; preds = %1
store i64 %"43", ptr addrspace(5) %"65", align 4
%"67" = load i64, ptr addrspace(5) %"65", align 4
store i64 %"67", ptr addrspace(5) %"66", align 4
%"69" = load i64, ptr addrspace(5) %"66", align 4
%"68" = add i64 %"69", 1
store i64 %"68", ptr addrspace(5) %"66", align 4
%"70" = load i64, ptr addrspace(5) %"66", align 4
store i64 %"70", ptr addrspace(5) %"64", align 4
%"71" = load i64, ptr addrspace(5) %"64", align 4
store i64 %"71", ptr addrspace(5) %"63", align 4
%2 = load i64, ptr addrspace(5) %"63", align 4
ret i64 %2
}
define amdgpu_kernel void @call(ptr addrspace(4) byref(i64) %"48", ptr addrspace(4) byref(i64) %"49") #1 {
%"50" = alloca i64, align 8, addrspace(5)
%"51" = alloca i64, align 8, addrspace(5)
%"52" = alloca i64, align 8, addrspace(5)
%"57" = alloca i64, align 8, addrspace(5)
%"58" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
br label %"44"
"44": ; preds = %1
%"53" = load i64, ptr addrspace(4) %"48", align 4
store i64 %"53", ptr addrspace(5) %"50", align 4
%"54" = load i64, ptr addrspace(4) %"49", align 4
store i64 %"54", ptr addrspace(5) %"51", align 4
%"56" = load i64, ptr addrspace(5) %"50", align 4
%"72" = inttoptr i64 %"56" to ptr addrspace(1)
%"55" = load i64, ptr addrspace(1) %"72", align 4
store i64 %"55", ptr addrspace(5) %"52", align 4
%"59" = load i64, ptr addrspace(5) %"52", align 4
store i64 %"59", ptr addrspace(5) %"57", align 4
%"40" = load i64, ptr addrspace(5) %"57", align 4
%"41" = call i64 @incr(i64 %"40")
br label %"45"
"45": ; preds = %"44"
store i64 %"41", ptr addrspace(5) %"58", align 4
%"60" = load i64, ptr addrspace(5) %"58", align 4
store i64 %"60", ptr addrspace(5) %"52", align 4
%"61" = load i64, ptr addrspace(5) %"51", align 4
%"62" = load i64, ptr addrspace(5) %"52", align 4
%"75" = inttoptr i64 %"61" to ptr addrspace(1)
store i64 %"62", ptr addrspace(1) %"75", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

155 ptx/src/test/ll/call_rnd.ll Normal file

@ -0,0 +1,155 @@
define float @add_rm(float %"79", float %"80") #0 {
%"128" = alloca float, align 4, addrspace(5)
%"129" = alloca float, align 4, addrspace(5)
%"130" = alloca float, align 4, addrspace(5)
%"131" = alloca float, align 4, addrspace(5)
%"132" = alloca float, align 4, addrspace(5)
%"133" = alloca float, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"89"
"89": ; preds = %1
call void @llvm.amdgcn.s.setreg(i32 6145, i32 2)
br label %"87"
"87": ; preds = %"89"
store float %"79", ptr addrspace(5) %"130", align 4
store float %"80", ptr addrspace(5) %"131", align 4
%"134" = load float, ptr addrspace(5) %"130", align 4
store float %"134", ptr addrspace(5) %"132", align 4
%"135" = load float, ptr addrspace(5) %"131", align 4
store float %"135", ptr addrspace(5) %"133", align 4
%"137" = load float, ptr addrspace(5) %"132", align 4
%"138" = load float, ptr addrspace(5) %"133", align 4
%"136" = fadd float %"137", %"138"
store float %"136", ptr addrspace(5) %"132", align 4
%"139" = load float, ptr addrspace(5) %"132", align 4
store float %"139", ptr addrspace(5) %"129", align 4
%"140" = load float, ptr addrspace(5) %"129", align 4
store float %"140", ptr addrspace(5) %"128", align 4
%2 = load float, ptr addrspace(5) %"128", align 4
ret float %2
}
define float @add_rp(float %"82", float %"83") #0 {
%"141" = alloca float, align 4, addrspace(5)
%"142" = alloca float, align 4, addrspace(5)
%"143" = alloca float, align 4, addrspace(5)
%"144" = alloca float, align 4, addrspace(5)
%"145" = alloca float, align 4, addrspace(5)
%"146" = alloca float, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"88"
"88": ; preds = %1
store float %"82", ptr addrspace(5) %"143", align 4
store float %"83", ptr addrspace(5) %"144", align 4
%"147" = load float, ptr addrspace(5) %"143", align 4
store float %"147", ptr addrspace(5) %"145", align 4
%"148" = load float, ptr addrspace(5) %"144", align 4
store float %"148", ptr addrspace(5) %"146", align 4
%"150" = load float, ptr addrspace(5) %"145", align 4
%"151" = load float, ptr addrspace(5) %"146", align 4
%"149" = fadd float %"150", %"151"
store float %"149", ptr addrspace(5) %"145", align 4
%"152" = load float, ptr addrspace(5) %"145", align 4
store float %"152", ptr addrspace(5) %"142", align 4
%"153" = load float, ptr addrspace(5) %"142", align 4
store float %"153", ptr addrspace(5) %"141", align 4
%2 = load float, ptr addrspace(5) %"141", align 4
ret float %2
}
define amdgpu_kernel void @call_rnd(ptr addrspace(4) byref(i64) %"92", ptr addrspace(4) byref(i64) %"93") #1 {
%"94" = alloca i64, align 8, addrspace(5)
%"95" = alloca i64, align 8, addrspace(5)
%"96" = alloca float, align 4, addrspace(5)
%"97" = alloca float, align 4, addrspace(5)
%"98" = alloca float, align 4, addrspace(5)
%"99" = alloca float, align 4, addrspace(5)
%"100" = alloca float, align 4, addrspace(5)
%"101" = alloca float, align 4, addrspace(5)
%"102" = alloca float, align 4, addrspace(5)
%"103" = alloca float, align 4, addrspace(5)
%"104" = alloca float, align 4, addrspace(5)
%"105" = alloca float, align 4, addrspace(5)
%"106" = alloca float, align 4, addrspace(5)
%"107" = alloca float, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"84"
"84": ; preds = %1
call void @llvm.amdgcn.s.setreg(i32 6145, i32 1)
%"108" = load i64, ptr addrspace(4) %"92", align 4
store i64 %"108", ptr addrspace(5) %"94", align 4
%"109" = load i64, ptr addrspace(4) %"93", align 4
store i64 %"109", ptr addrspace(5) %"95", align 4
%"111" = load i64, ptr addrspace(5) %"94", align 4
%"154" = inttoptr i64 %"111" to ptr
%"110" = load float, ptr %"154", align 4
store float %"110", ptr addrspace(5) %"96", align 4
%"112" = load i64, ptr addrspace(5) %"94", align 4
%"155" = inttoptr i64 %"112" to ptr
%"59" = getelementptr inbounds i8, ptr %"155", i64 4
%"113" = load float, ptr %"59", align 4
store float %"113", ptr addrspace(5) %"97", align 4
%"114" = load i64, ptr addrspace(5) %"94", align 4
%"156" = inttoptr i64 %"114" to ptr
%"61" = getelementptr inbounds i8, ptr %"156", i64 8
%"115" = load float, ptr %"61", align 4
store float %"115", ptr addrspace(5) %"98", align 4
%"116" = load i64, ptr addrspace(5) %"94", align 4
%"157" = inttoptr i64 %"116" to ptr
%"63" = getelementptr inbounds i8, ptr %"157", i64 12
%"117" = load float, ptr %"63", align 4
store float %"117", ptr addrspace(5) %"99", align 4
%"118" = load float, ptr addrspace(5) %"96", align 4
store float %"118", ptr addrspace(5) %"102", align 4
%"119" = load float, ptr addrspace(5) %"97", align 4
store float %"119", ptr addrspace(5) %"103", align 4
%"72" = load float, ptr addrspace(5) %"102", align 4
%"73" = load float, ptr addrspace(5) %"103", align 4
%"74" = call float @add_rp(float %"72", float %"73")
br label %"85"
"85": ; preds = %"84"
store float %"74", ptr addrspace(5) %"104", align 4
%"120" = load float, ptr addrspace(5) %"104", align 4
store float %"120", ptr addrspace(5) %"100", align 4
%"121" = load i64, ptr addrspace(5) %"95", align 4
%"122" = load float, ptr addrspace(5) %"100", align 4
%"158" = inttoptr i64 %"121" to ptr
store float %"122", ptr %"158", align 4
%"123" = load float, ptr addrspace(5) %"98", align 4
store float %"123", ptr addrspace(5) %"105", align 4
%"124" = load float, ptr addrspace(5) %"99", align 4
store float %"124", ptr addrspace(5) %"106", align 4
%"75" = load float, ptr addrspace(5) %"105", align 4
%"76" = load float, ptr addrspace(5) %"106", align 4
%"77" = call float @add_rm(float %"75", float %"76")
br label %"86"
"86": ; preds = %"85"
store float %"77", ptr addrspace(5) %"107", align 4
%"125" = load float, ptr addrspace(5) %"107", align 4
store float %"125", ptr addrspace(5) %"101", align 4
%"126" = load i64, ptr addrspace(5) %"95", align 4
%"159" = inttoptr i64 %"126" to ptr
%"65" = getelementptr inbounds i8, ptr %"159", i64 4
%"127" = load float, ptr addrspace(5) %"101", align 4
store float %"127", ptr %"65", align 4
ret void
}
; Function Attrs: nocallback nofree nosync nounwind willreturn
declare void @llvm.amdgcn.s.setreg(i32 immarg, i32) #2
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #2 = { nocallback nofree nosync nounwind willreturn }

33 ptx/src/test/ll/clz.ll Normal file

@ -0,0 +1,33 @@
define amdgpu_kernel void @clz(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
%"32" = alloca i64, align 8, addrspace(5)
%"33" = alloca i64, align 8, addrspace(5)
%"34" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"29"
"29": ; preds = %1
%"35" = load i64, ptr addrspace(4) %"30", align 4
store i64 %"35", ptr addrspace(5) %"32", align 4
%"36" = load i64, ptr addrspace(4) %"31", align 4
store i64 %"36", ptr addrspace(5) %"33", align 4
%"38" = load i64, ptr addrspace(5) %"32", align 4
%"43" = inttoptr i64 %"38" to ptr
%"37" = load i32, ptr %"43", align 4
store i32 %"37", ptr addrspace(5) %"34", align 4
%"40" = load i32, ptr addrspace(5) %"34", align 4
%"44" = call i32 @llvm.ctlz.i32(i32 %"40", i1 false)
store i32 %"44", ptr addrspace(5) %"34", align 4
%"41" = load i64, ptr addrspace(5) %"33", align 4
%"42" = load i32, ptr addrspace(5) %"34", align 4
%"45" = inttoptr i64 %"41" to ptr
store i32 %"42", ptr %"45", align 4
ret void
}
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.ctlz.i32(i32, i1 immarg) #1
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }

50 ptx/src/test/ll/const.ll Normal file

@ -0,0 +1,50 @@
@constparams = addrspace(4) global [4 x i16] [i16 10, i16 20, i16 30, i16 40], align 8
define amdgpu_kernel void @const(ptr addrspace(4) byref(i64) %"46", ptr addrspace(4) byref(i64) %"47") #0 {
%"48" = alloca i64, align 8, addrspace(5)
%"49" = alloca i64, align 8, addrspace(5)
%"50" = alloca i16, align 2, addrspace(5)
%"51" = alloca i16, align 2, addrspace(5)
%"52" = alloca i16, align 2, addrspace(5)
%"53" = alloca i16, align 2, addrspace(5)
br label %1
1: ; preds = %0
br label %"45"
"45": ; preds = %1
%"54" = load i64, ptr addrspace(4) %"46", align 4
store i64 %"54", ptr addrspace(5) %"48", align 4
%"55" = load i64, ptr addrspace(4) %"47", align 4
store i64 %"55", ptr addrspace(5) %"49", align 4
%"56" = load i16, ptr addrspace(4) @constparams, align 2
store i16 %"56", ptr addrspace(5) %"50", align 2
%"57" = load i16, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) @constparams, i64 2), align 2
store i16 %"57", ptr addrspace(5) %"51", align 2
%"58" = load i16, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) @constparams, i64 4), align 2
store i16 %"58", ptr addrspace(5) %"52", align 2
%"59" = load i16, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) @constparams, i64 6), align 2
store i16 %"59", ptr addrspace(5) %"53", align 2
%"60" = load i64, ptr addrspace(5) %"49", align 4
%"61" = load i16, ptr addrspace(5) %"50", align 2
%"72" = inttoptr i64 %"60" to ptr
store i16 %"61", ptr %"72", align 2
%"62" = load i64, ptr addrspace(5) %"49", align 4
%"74" = inttoptr i64 %"62" to ptr
%"40" = getelementptr inbounds i8, ptr %"74", i64 2
%"63" = load i16, ptr addrspace(5) %"51", align 2
store i16 %"63", ptr %"40", align 2
%"64" = load i64, ptr addrspace(5) %"49", align 4
%"76" = inttoptr i64 %"64" to ptr
%"42" = getelementptr inbounds i8, ptr %"76", i64 4
%"65" = load i16, ptr addrspace(5) %"52", align 2
store i16 %"65", ptr %"42", align 2
%"66" = load i64, ptr addrspace(5) %"49", align 4
%"78" = inttoptr i64 %"66" to ptr
%"44" = getelementptr inbounds i8, ptr %"78", i64 6
%"67" = load i16, ptr addrspace(5) %"53", align 2
store i16 %"67", ptr %"44", align 2
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }


@ -0,0 +1,29 @@
define amdgpu_kernel void @constant_f32(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
%"33" = alloca i64, align 8, addrspace(5)
%"34" = alloca i64, align 8, addrspace(5)
%"35" = alloca float, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"30"
"30": ; preds = %1
%"36" = load i64, ptr addrspace(4) %"31", align 4
store i64 %"36", ptr addrspace(5) %"33", align 4
%"37" = load i64, ptr addrspace(4) %"32", align 4
store i64 %"37", ptr addrspace(5) %"34", align 4
%"39" = load i64, ptr addrspace(5) %"33", align 4
%"44" = inttoptr i64 %"39" to ptr
%"38" = load float, ptr %"44", align 4
store float %"38", ptr addrspace(5) %"35", align 4
%"41" = load float, ptr addrspace(5) %"35", align 4
%"40" = fmul float %"41", 5.000000e-01
store float %"40", ptr addrspace(5) %"35", align 4
%"42" = load i64, ptr addrspace(5) %"34", align 4
%"43" = load float, ptr addrspace(5) %"35", align 4
%"45" = inttoptr i64 %"42" to ptr
store float %"43", ptr %"45", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }


@ -0,0 +1,29 @@
define amdgpu_kernel void @constant_negative(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
%"33" = alloca i64, align 8, addrspace(5)
%"34" = alloca i64, align 8, addrspace(5)
%"35" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"30"
"30": ; preds = %1
%"36" = load i64, ptr addrspace(4) %"31", align 4
store i64 %"36", ptr addrspace(5) %"33", align 4
%"37" = load i64, ptr addrspace(4) %"32", align 4
store i64 %"37", ptr addrspace(5) %"34", align 4
%"39" = load i64, ptr addrspace(5) %"33", align 4
%"44" = inttoptr i64 %"39" to ptr
%"38" = load i32, ptr %"44", align 4
store i32 %"38", ptr addrspace(5) %"35", align 4
%"41" = load i32, ptr addrspace(5) %"35", align 4
%"40" = mul i32 %"41", -1
store i32 %"40", ptr addrspace(5) %"35", align 4
%"42" = load i64, ptr addrspace(5) %"34", align 4
%"43" = load i32, ptr addrspace(5) %"35", align 4
%"45" = inttoptr i64 %"42" to ptr
store i32 %"43", ptr %"45", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

33 ptx/src/test/ll/cos.ll Normal file

@ -0,0 +1,33 @@
define amdgpu_kernel void @cos(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
%"32" = alloca i64, align 8, addrspace(5)
%"33" = alloca i64, align 8, addrspace(5)
%"34" = alloca float, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"29"
"29": ; preds = %1
%"35" = load i64, ptr addrspace(4) %"30", align 4
store i64 %"35", ptr addrspace(5) %"32", align 4
%"36" = load i64, ptr addrspace(4) %"31", align 4
store i64 %"36", ptr addrspace(5) %"33", align 4
%"38" = load i64, ptr addrspace(5) %"32", align 4
%"43" = inttoptr i64 %"38" to ptr
%"37" = load float, ptr %"43", align 4
store float %"37", ptr addrspace(5) %"34", align 4
%"40" = load float, ptr addrspace(5) %"34", align 4
%"39" = call afn float @llvm.cos.f32(float %"40")
store float %"39", ptr addrspace(5) %"34", align 4
%"41" = load i64, ptr addrspace(5) %"33", align 4
%"42" = load float, ptr addrspace(5) %"34", align 4
%"44" = inttoptr i64 %"41" to ptr
store float %"42", ptr %"44", align 4
ret void
}
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare float @llvm.cos.f32(float) #1
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }


@ -0,0 +1,30 @@
define amdgpu_kernel void @cvt_f64_f32(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
%"33" = alloca i64, align 8, addrspace(5)
%"34" = alloca i64, align 8, addrspace(5)
%"35" = alloca float, align 4, addrspace(5)
%"36" = alloca double, align 8, addrspace(5)
br label %1
1: ; preds = %0
br label %"30"
"30": ; preds = %1
%"37" = load i64, ptr addrspace(4) %"31", align 4
store i64 %"37", ptr addrspace(5) %"33", align 4
%"38" = load i64, ptr addrspace(4) %"32", align 4
store i64 %"38", ptr addrspace(5) %"34", align 4
%"40" = load i64, ptr addrspace(5) %"33", align 4
%"45" = inttoptr i64 %"40" to ptr addrspace(1)
%"39" = load float, ptr addrspace(1) %"45", align 4
store float %"39", ptr addrspace(5) %"35", align 4
%"42" = load float, ptr addrspace(5) %"35", align 4
%"41" = fpext float %"42" to double
store double %"41", ptr addrspace(5) %"36", align 8
%"43" = load i64, ptr addrspace(5) %"34", align 4
%"44" = load double, ptr addrspace(5) %"36", align 8
%"46" = inttoptr i64 %"43" to ptr
store double %"44", ptr %"46", align 8
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }


@ -0,0 +1,49 @@
define amdgpu_kernel void @cvt_rni(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 {
%"37" = alloca i64, align 8, addrspace(5)
%"38" = alloca i64, align 8, addrspace(5)
%"39" = alloca float, align 4, addrspace(5)
%"40" = alloca float, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"34"
"34": ; preds = %1
%"41" = load i64, ptr addrspace(4) %"35", align 4
store i64 %"41", ptr addrspace(5) %"37", align 4
%"42" = load i64, ptr addrspace(4) %"36", align 4
store i64 %"42", ptr addrspace(5) %"38", align 4
%"44" = load i64, ptr addrspace(5) %"37", align 4
%"55" = inttoptr i64 %"44" to ptr
%"43" = load float, ptr %"55", align 4
store float %"43", ptr addrspace(5) %"39", align 4
%"45" = load i64, ptr addrspace(5) %"37", align 4
%"56" = inttoptr i64 %"45" to ptr
%"31" = getelementptr inbounds i8, ptr %"56", i64 4
%"46" = load float, ptr %"31", align 4
store float %"46", ptr addrspace(5) %"40", align 4
%"48" = load float, ptr addrspace(5) %"39", align 4
%2 = call float @llvm.roundeven.f32(float %"48")
%"47" = freeze float %2
store float %"47", ptr addrspace(5) %"39", align 4
%"50" = load float, ptr addrspace(5) %"40", align 4
%3 = call float @llvm.roundeven.f32(float %"50")
%"49" = freeze float %3
store float %"49", ptr addrspace(5) %"40", align 4
%"51" = load i64, ptr addrspace(5) %"38", align 4
%"52" = load float, ptr addrspace(5) %"39", align 4
%"57" = inttoptr i64 %"51" to ptr
store float %"52", ptr %"57", align 4
%"53" = load i64, ptr addrspace(5) %"38", align 4
%"58" = inttoptr i64 %"53" to ptr
%"33" = getelementptr inbounds i8, ptr %"58", i64 4
%"54" = load float, ptr addrspace(5) %"40", align 4
store float %"54", ptr %"33", align 4
ret void
}
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare float @llvm.roundeven.f32(float) #1
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }


@ -0,0 +1,54 @@
define amdgpu_kernel void @cvt_rzi(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 {
%"37" = alloca i64, align 8, addrspace(5)
%"38" = alloca i64, align 8, addrspace(5)
%"39" = alloca float, align 4, addrspace(5)
%"40" = alloca float, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"34"
"34": ; preds = %1
call void @llvm.amdgcn.s.setreg(i32 6145, i32 3)
%"41" = load i64, ptr addrspace(4) %"35", align 4
store i64 %"41", ptr addrspace(5) %"37", align 4
%"42" = load i64, ptr addrspace(4) %"36", align 4
store i64 %"42", ptr addrspace(5) %"38", align 4
%"44" = load i64, ptr addrspace(5) %"37", align 4
%"55" = inttoptr i64 %"44" to ptr
%"43" = load float, ptr %"55", align 4
store float %"43", ptr addrspace(5) %"39", align 4
%"45" = load i64, ptr addrspace(5) %"37", align 4
%"56" = inttoptr i64 %"45" to ptr
%"31" = getelementptr inbounds i8, ptr %"56", i64 4
%"46" = load float, ptr %"31", align 4
store float %"46", ptr addrspace(5) %"40", align 4
%"48" = load float, ptr addrspace(5) %"39", align 4
%2 = call float @llvm.trunc.f32(float %"48")
%"47" = freeze float %2
store float %"47", ptr addrspace(5) %"39", align 4
%"50" = load float, ptr addrspace(5) %"40", align 4
%3 = call float @llvm.trunc.f32(float %"50")
%"49" = freeze float %3
store float %"49", ptr addrspace(5) %"40", align 4
%"51" = load i64, ptr addrspace(5) %"38", align 4
%"52" = load float, ptr addrspace(5) %"39", align 4
%"57" = inttoptr i64 %"51" to ptr
store float %"52", ptr %"57", align 4
%"53" = load i64, ptr addrspace(5) %"38", align 4
%"58" = inttoptr i64 %"53" to ptr
%"33" = getelementptr inbounds i8, ptr %"58", i64 4
%"54" = load float, ptr addrspace(5) %"40", align 4
store float %"54", ptr %"33", align 4
ret void
}
; Function Attrs: nocallback nofree nosync nounwind willreturn
declare void @llvm.amdgcn.s.setreg(i32 immarg, i32) #1
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare float @llvm.trunc.f32(float) #2
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind willreturn }
attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }


@ -0,0 +1,32 @@
define amdgpu_kernel void @cvt_s16_s8(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
%"33" = alloca i64, align 8, addrspace(5)
%"34" = alloca i64, align 8, addrspace(5)
%"35" = alloca i32, align 4, addrspace(5)
%"36" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"30"
"30": ; preds = %1
%"37" = load i64, ptr addrspace(4) %"31", align 4
store i64 %"37", ptr addrspace(5) %"33", align 4
%"38" = load i64, ptr addrspace(4) %"32", align 4
store i64 %"38", ptr addrspace(5) %"34", align 4
%"40" = load i64, ptr addrspace(5) %"33", align 4
%"45" = inttoptr i64 %"40" to ptr addrspace(1)
%"39" = load i32, ptr addrspace(1) %"45", align 4
store i32 %"39", ptr addrspace(5) %"36", align 4
%"42" = load i32, ptr addrspace(5) %"36", align 4
%2 = trunc i32 %"42" to i8
%"46" = sext i8 %2 to i16
%"41" = sext i16 %"46" to i32
store i32 %"41", ptr addrspace(5) %"35", align 4
%"43" = load i64, ptr addrspace(5) %"34", align 4
%"44" = load i32, ptr addrspace(5) %"35", align 4
%"48" = inttoptr i64 %"43" to ptr
store i32 %"44", ptr %"48", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }


@ -0,0 +1,55 @@
define amdgpu_kernel void @cvt_s32_f32(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 {
%"37" = alloca i64, align 8, addrspace(5)
%"38" = alloca i64, align 8, addrspace(5)
%"39" = alloca i32, align 4, addrspace(5)
%"40" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"34"
"34": ; preds = %1
%"41" = load i64, ptr addrspace(4) %"35", align 4
store i64 %"41", ptr addrspace(5) %"37", align 4
%"42" = load i64, ptr addrspace(4) %"36", align 4
store i64 %"42", ptr addrspace(5) %"38", align 4
%"44" = load i64, ptr addrspace(5) %"37", align 4
%"56" = inttoptr i64 %"44" to ptr
%"55" = load float, ptr %"56", align 4
%"43" = bitcast float %"55" to i32
store i32 %"43", ptr addrspace(5) %"39", align 4
%"45" = load i64, ptr addrspace(5) %"37", align 4
%"57" = inttoptr i64 %"45" to ptr
%"31" = getelementptr inbounds i8, ptr %"57", i64 4
%"58" = load float, ptr %"31", align 4
%"46" = bitcast float %"58" to i32
store i32 %"46", ptr addrspace(5) %"40", align 4
%"48" = load i32, ptr addrspace(5) %"39", align 4
%"60" = bitcast i32 %"48" to float
%2 = call float @llvm.ceil.f32(float %"60")
%3 = fptosi float %2 to i32
%"59" = freeze i32 %3
store i32 %"59", ptr addrspace(5) %"39", align 4
%"50" = load i32, ptr addrspace(5) %"40", align 4
%"62" = bitcast i32 %"50" to float
%4 = call float @llvm.ceil.f32(float %"62")
%5 = fptosi float %4 to i32
%"61" = freeze i32 %5
store i32 %"61", ptr addrspace(5) %"40", align 4
%"51" = load i64, ptr addrspace(5) %"38", align 4
%"52" = load i32, ptr addrspace(5) %"39", align 4
%"63" = inttoptr i64 %"51" to ptr addrspace(1)
store i32 %"52", ptr addrspace(1) %"63", align 4
%"53" = load i64, ptr addrspace(5) %"38", align 4
%"65" = inttoptr i64 %"53" to ptr addrspace(1)
%"33" = getelementptr inbounds i8, ptr addrspace(1) %"65", i64 4
%"54" = load i32, ptr addrspace(5) %"40", align 4
store i32 %"54", ptr addrspace(1) %"33", align 4
ret void
}
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare float @llvm.ceil.f32(float) #1
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }


@ -0,0 +1,30 @@
define amdgpu_kernel void @cvt_s64_s32(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
%"33" = alloca i64, align 8, addrspace(5)
%"34" = alloca i64, align 8, addrspace(5)
%"35" = alloca i32, align 4, addrspace(5)
%"36" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
br label %"30"
"30": ; preds = %1
%"37" = load i64, ptr addrspace(4) %"31", align 4
store i64 %"37", ptr addrspace(5) %"33", align 4
%"38" = load i64, ptr addrspace(4) %"32", align 4
store i64 %"38", ptr addrspace(5) %"34", align 4
%"40" = load i64, ptr addrspace(5) %"33", align 4
%"46" = inttoptr i64 %"40" to ptr
%"45" = load i32, ptr %"46", align 4
store i32 %"45", ptr addrspace(5) %"35", align 4
%"42" = load i32, ptr addrspace(5) %"35", align 4
%"41" = sext i32 %"42" to i64
store i64 %"41", ptr addrspace(5) %"36", align 4
%"43" = load i64, ptr addrspace(5) %"34", align 4
%"44" = load i64, ptr addrspace(5) %"36", align 4
%"47" = inttoptr i64 %"43" to ptr
store i64 %"44", ptr %"47", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }


@ -0,0 +1,41 @@
define amdgpu_kernel void @cvt_sat_s_u(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) byref(i64) %"33") #0 {
%"34" = alloca i64, align 8, addrspace(5)
%"35" = alloca i64, align 8, addrspace(5)
%"36" = alloca i32, align 4, addrspace(5)
%"37" = alloca i32, align 4, addrspace(5)
%"38" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"31"
"31": ; preds = %1
%"39" = load i64, ptr addrspace(4) %"32", align 4
store i64 %"39", ptr addrspace(5) %"34", align 4
%"40" = load i64, ptr addrspace(4) %"33", align 4
store i64 %"40", ptr addrspace(5) %"35", align 4
%"42" = load i64, ptr addrspace(5) %"34", align 4
%"49" = inttoptr i64 %"42" to ptr
%"41" = load i32, ptr %"49", align 4
store i32 %"41", ptr addrspace(5) %"36", align 4
%"44" = load i32, ptr addrspace(5) %"36", align 4
%2 = call i32 @llvm.smax.i32(i32 %"44", i32 0)
%3 = call i32 @llvm.umin.i32(i32 %2, i32 -1)
store i32 %3, ptr addrspace(5) %"37", align 4
%"46" = load i32, ptr addrspace(5) %"37", align 4
store i32 %"46", ptr addrspace(5) %"38", align 4
%"47" = load i64, ptr addrspace(5) %"35", align 4
%"48" = load i32, ptr addrspace(5) %"38", align 4
%"50" = inttoptr i64 %"47" to ptr
store i32 %"48", ptr %"50", align 4
ret void
}
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.smax.i32(i32, i32) #1
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.umin.i32(i32, i32) #1
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }

34 ptx/src/test/ll/cvta.ll Normal file

@ -0,0 +1,34 @@
define amdgpu_kernel void @cvta(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
%"32" = alloca i64, align 8, addrspace(5)
%"33" = alloca i64, align 8, addrspace(5)
%"34" = alloca float, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"29"
"29": ; preds = %1
%"35" = load i64, ptr addrspace(4) %"30", align 4
store i64 %"35", ptr addrspace(5) %"32", align 4
%"36" = load i64, ptr addrspace(4) %"31", align 4
store i64 %"36", ptr addrspace(5) %"33", align 4
%"38" = load i64, ptr addrspace(5) %"32", align 4
%2 = inttoptr i64 %"38" to ptr
%"45" = addrspacecast ptr %2 to ptr addrspace(1)
store ptr addrspace(1) %"45", ptr addrspace(5) %"32", align 8
%"40" = load i64, ptr addrspace(5) %"33", align 4
%3 = inttoptr i64 %"40" to ptr
%"47" = addrspacecast ptr %3 to ptr addrspace(1)
store ptr addrspace(1) %"47", ptr addrspace(5) %"33", align 8
%"42" = load i64, ptr addrspace(5) %"32", align 4
%"49" = inttoptr i64 %"42" to ptr addrspace(1)
%"41" = load float, ptr addrspace(1) %"49", align 4
store float %"41", ptr addrspace(5) %"34", align 4
%"43" = load i64, ptr addrspace(5) %"33", align 4
%"44" = load float, ptr addrspace(5) %"34", align 4
%"50" = inttoptr i64 %"43" to ptr addrspace(1)
store float %"44", ptr addrspace(1) %"50", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }


@ -0,0 +1,36 @@
define amdgpu_kernel void @div_approx(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 {
%"35" = alloca i64, align 8, addrspace(5)
%"36" = alloca i64, align 8, addrspace(5)
%"37" = alloca float, align 4, addrspace(5)
%"38" = alloca float, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"32"
"32": ; preds = %1
%"39" = load i64, ptr addrspace(4) %"33", align 4
store i64 %"39", ptr addrspace(5) %"35", align 4
%"40" = load i64, ptr addrspace(4) %"34", align 4
store i64 %"40", ptr addrspace(5) %"36", align 4
%"42" = load i64, ptr addrspace(5) %"35", align 4
%"50" = inttoptr i64 %"42" to ptr
%"41" = load float, ptr %"50", align 4
store float %"41", ptr addrspace(5) %"37", align 4
%"43" = load i64, ptr addrspace(5) %"35", align 4
%"51" = inttoptr i64 %"43" to ptr
%"31" = getelementptr inbounds i8, ptr %"51", i64 4
%"44" = load float, ptr %"31", align 4
store float %"44", ptr addrspace(5) %"38", align 4
%"46" = load float, ptr addrspace(5) %"37", align 4
%"47" = load float, ptr addrspace(5) %"38", align 4
%"45" = fdiv arcp afn float %"46", %"47"
store float %"45", ptr addrspace(5) %"37", align 4
%"48" = load i64, ptr addrspace(5) %"36", align 4
%"49" = load float, ptr addrspace(5) %"37", align 4
%"52" = inttoptr i64 %"48" to ptr
store float %"49", ptr %"52", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }

33 ptx/src/test/ll/ex2.ll Normal file

@ -0,0 +1,33 @@
define amdgpu_kernel void @ex2(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
%"32" = alloca i64, align 8, addrspace(5)
%"33" = alloca i64, align 8, addrspace(5)
%"34" = alloca float, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"29"
"29": ; preds = %1
%"35" = load i64, ptr addrspace(4) %"30", align 4
store i64 %"35", ptr addrspace(5) %"32", align 4
%"36" = load i64, ptr addrspace(4) %"31", align 4
store i64 %"36", ptr addrspace(5) %"33", align 4
%"38" = load i64, ptr addrspace(5) %"32", align 4
%"43" = inttoptr i64 %"38" to ptr
%"37" = load float, ptr %"43", align 4
store float %"37", ptr addrspace(5) %"34", align 4
%"40" = load float, ptr addrspace(5) %"34", align 4
%"39" = call float @llvm.amdgcn.exp2.f32(float %"40")
store float %"39", ptr addrspace(5) %"34", align 4
%"41" = load i64, ptr addrspace(5) %"33", align 4
%"42" = load float, ptr addrspace(5) %"34", align 4
%"44" = inttoptr i64 %"41" to ptr
store float %"42", ptr %"44", align 4
ret void
}
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare float @llvm.amdgcn.exp2.f32(float) #1
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }


@ -0,0 +1,32 @@
@shared_mem = external addrspace(3) global [0 x i32]
define amdgpu_kernel void @extern_shared(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
%"33" = alloca i64, align 8, addrspace(5)
%"34" = alloca i64, align 8, addrspace(5)
%"35" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
br label %"30"
"30": ; preds = %1
%"36" = load i64, ptr addrspace(4) %"31", align 4
store i64 %"36", ptr addrspace(5) %"33", align 4
%"37" = load i64, ptr addrspace(4) %"32", align 4
store i64 %"37", ptr addrspace(5) %"34", align 4
%"39" = load i64, ptr addrspace(5) %"33", align 4
%"44" = inttoptr i64 %"39" to ptr addrspace(1)
%"38" = load i64, ptr addrspace(1) %"44", align 4
store i64 %"38", ptr addrspace(5) %"35", align 4
%"40" = load i64, ptr addrspace(5) %"35", align 4
store i64 %"40", ptr addrspace(3) @shared_mem, align 4
%"41" = load i64, ptr addrspace(3) @shared_mem, align 4
store i64 %"41", ptr addrspace(5) %"35", align 4
%"42" = load i64, ptr addrspace(5) %"34", align 4
%"43" = load i64, ptr addrspace(5) %"35", align 4
%"47" = inttoptr i64 %"42" to ptr addrspace(1)
store i64 %"43", ptr addrspace(1) %"47", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

View File

@ -0,0 +1,55 @@
@shared_mem = external addrspace(3) global [0 x i32], align 4
define void @incr_shared_2_global() #0 {
%"36" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
br label %"33"
"33": ; preds = %1
%"37" = load i64, ptr addrspace(3) @shared_mem, align 4
store i64 %"37", ptr addrspace(5) %"36", align 4
%"39" = load i64, ptr addrspace(5) %"36", align 4
%"38" = add i64 %"39", 2
store i64 %"38", ptr addrspace(5) %"36", align 4
%"40" = load i64, ptr addrspace(5) %"36", align 4
store i64 %"40", ptr addrspace(3) @shared_mem, align 4
ret void
}
define amdgpu_kernel void @extern_shared_call(ptr addrspace(4) byref(i64) %"41", ptr addrspace(4) byref(i64) %"42") #1 {
%"43" = alloca i64, align 8, addrspace(5)
%"44" = alloca i64, align 8, addrspace(5)
%"45" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
br label %"34"
"34": ; preds = %1
%"46" = load i64, ptr addrspace(4) %"41", align 4
store i64 %"46", ptr addrspace(5) %"43", align 4
%"47" = load i64, ptr addrspace(4) %"42", align 4
store i64 %"47", ptr addrspace(5) %"44", align 4
%"49" = load i64, ptr addrspace(5) %"43", align 4
%"56" = inttoptr i64 %"49" to ptr addrspace(1)
%"48" = load i64, ptr addrspace(1) %"56", align 4
store i64 %"48", ptr addrspace(5) %"45", align 4
%"50" = load i64, ptr addrspace(5) %"45", align 4
store i64 %"50", ptr addrspace(3) @shared_mem, align 4
call void @incr_shared_2_global()
br label %"35"
"35": ; preds = %"34"
%"51" = load i64, ptr addrspace(3) @shared_mem, align 4
store i64 %"51", ptr addrspace(5) %"45", align 4
%"52" = load i64, ptr addrspace(5) %"44", align 4
%"53" = load i64, ptr addrspace(5) %"45", align 4
%"59" = inttoptr i64 %"52" to ptr addrspace(1)
store i64 %"53", ptr addrspace(1) %"59", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

47 ptx/src/test/ll/fma.ll Normal file

@ -0,0 +1,47 @@
define amdgpu_kernel void @fma(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 {
%"38" = alloca i64, align 8, addrspace(5)
%"39" = alloca i64, align 8, addrspace(5)
%"40" = alloca float, align 4, addrspace(5)
%"41" = alloca float, align 4, addrspace(5)
%"42" = alloca float, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"35"
"35": ; preds = %1
%"43" = load i64, ptr addrspace(4) %"36", align 4
store i64 %"43", ptr addrspace(5) %"38", align 4
%"44" = load i64, ptr addrspace(4) %"37", align 4
store i64 %"44", ptr addrspace(5) %"39", align 4
%"46" = load i64, ptr addrspace(5) %"38", align 4
%"57" = inttoptr i64 %"46" to ptr
%"45" = load float, ptr %"57", align 4
store float %"45", ptr addrspace(5) %"40", align 4
%"47" = load i64, ptr addrspace(5) %"38", align 4
%"58" = inttoptr i64 %"47" to ptr
%"32" = getelementptr inbounds i8, ptr %"58", i64 4
%"48" = load float, ptr %"32", align 4
store float %"48", ptr addrspace(5) %"41", align 4
%"49" = load i64, ptr addrspace(5) %"38", align 4
%"59" = inttoptr i64 %"49" to ptr
%"34" = getelementptr inbounds i8, ptr %"59", i64 8
%"50" = load float, ptr %"34", align 4
store float %"50", ptr addrspace(5) %"42", align 4
%"52" = load float, ptr addrspace(5) %"40", align 4
%"53" = load float, ptr addrspace(5) %"41", align 4
%"54" = load float, ptr addrspace(5) %"42", align 4
%"51" = call float @llvm.fma.f32(float %"52", float %"53", float %"54")
store float %"51", ptr addrspace(5) %"40", align 4
%"55" = load i64, ptr addrspace(5) %"39", align 4
%"56" = load float, ptr addrspace(5) %"40", align 4
%"60" = inttoptr i64 %"55" to ptr
store float %"56", ptr %"60", align 4
ret void
}
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare float @llvm.fma.f32(float, float, float) #1
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }

View File

@ -0,0 +1,27 @@
@foobar = addrspace(1) global [4 x i32] [i32 1, i32 0, i32 0, i32 0]
define amdgpu_kernel void @global_array(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
%"33" = alloca i64, align 8, addrspace(5)
%"34" = alloca i64, align 8, addrspace(5)
%"35" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"30"
"30": ; preds = %1
store i64 ptrtoint (ptr addrspace(1) @foobar to i64), ptr addrspace(5) %"33", align 4
%"37" = load i64, ptr addrspace(4) %"32", align 4
store i64 %"37", ptr addrspace(5) %"34", align 4
%"39" = load i64, ptr addrspace(5) %"33", align 4
%"43" = inttoptr i64 %"39" to ptr addrspace(1)
%"38" = load i32, ptr addrspace(1) %"43", align 4
store i32 %"38", ptr addrspace(5) %"35", align 4
%"40" = load i64, ptr addrspace(5) %"34", align 4
%"41" = load i32, ptr addrspace(5) %"35", align 4
%"44" = inttoptr i64 %"40" to ptr addrspace(1)
store i32 %"41", ptr addrspace(1) %"44", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

26 ptx/src/test/ll/ld_st.ll Normal file

@ -0,0 +1,26 @@
define amdgpu_kernel void @ld_st(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
%"32" = alloca i64, align 8, addrspace(5)
%"33" = alloca i64, align 8, addrspace(5)
%"34" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
br label %"29"
"29": ; preds = %1
%"35" = load i64, ptr addrspace(4) %"30", align 4
store i64 %"35", ptr addrspace(5) %"32", align 4
%"36" = load i64, ptr addrspace(4) %"31", align 4
store i64 %"36", ptr addrspace(5) %"33", align 4
%"38" = load i64, ptr addrspace(5) %"32", align 4
%"41" = inttoptr i64 %"38" to ptr
%"37" = load i64, ptr %"41", align 4
store i64 %"37", ptr addrspace(5) %"34", align 4
%"39" = load i64, ptr addrspace(5) %"33", align 4
%"40" = load i64, ptr addrspace(5) %"34", align 4
%"42" = inttoptr i64 %"39" to ptr
store i64 %"40", ptr %"42", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }


@ -0,0 +1,31 @@
define amdgpu_kernel void @ld_st_implicit(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
%"33" = alloca i64, align 8, addrspace(5)
%"34" = alloca i64, align 8, addrspace(5)
%"35" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
br label %"30"
"30": ; preds = %1
%"36" = load i64, ptr addrspace(4) %"31", align 4
store i64 %"36", ptr addrspace(5) %"33", align 4
%"37" = load i64, ptr addrspace(4) %"32", align 4
store i64 %"37", ptr addrspace(5) %"34", align 4
store i64 81985529216486895, ptr addrspace(5) %"35", align 4
%"40" = load i64, ptr addrspace(5) %"33", align 4
%"44" = inttoptr i64 %"40" to ptr addrspace(1)
%"43" = load float, ptr addrspace(1) %"44", align 4
%2 = bitcast float %"43" to i32
%"39" = zext i32 %2 to i64
store i64 %"39", ptr addrspace(5) %"35", align 4
%"41" = load i64, ptr addrspace(5) %"34", align 4
%"42" = load i64, ptr addrspace(5) %"35", align 4
%"45" = inttoptr i64 %"41" to ptr addrspace(1)
%3 = trunc i64 %"42" to i32
%"46" = bitcast i32 %3 to float
store float %"46", ptr addrspace(1) %"45", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }


@ -0,0 +1,37 @@
define amdgpu_kernel void @ld_st_offset(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 {
%"37" = alloca i64, align 8, addrspace(5)
%"38" = alloca i64, align 8, addrspace(5)
%"39" = alloca i32, align 4, addrspace(5)
%"40" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"34"
"34": ; preds = %1
%"41" = load i64, ptr addrspace(4) %"35", align 4
store i64 %"41", ptr addrspace(5) %"37", align 4
%"42" = load i64, ptr addrspace(4) %"36", align 4
store i64 %"42", ptr addrspace(5) %"38", align 4
%"44" = load i64, ptr addrspace(5) %"37", align 4
%"51" = inttoptr i64 %"44" to ptr
%"43" = load i32, ptr %"51", align 4
store i32 %"43", ptr addrspace(5) %"39", align 4
%"45" = load i64, ptr addrspace(5) %"37", align 4
%"52" = inttoptr i64 %"45" to ptr
%"31" = getelementptr inbounds i8, ptr %"52", i64 4
%"46" = load i32, ptr %"31", align 4
store i32 %"46", ptr addrspace(5) %"40", align 4
%"47" = load i64, ptr addrspace(5) %"38", align 4
%"48" = load i32, ptr addrspace(5) %"40", align 4
%"53" = inttoptr i64 %"47" to ptr
store i32 %"48", ptr %"53", align 4
%"49" = load i64, ptr addrspace(5) %"38", align 4
%"54" = inttoptr i64 %"49" to ptr
%"33" = getelementptr inbounds i8, ptr %"54", i64 4
%"50" = load i32, ptr addrspace(5) %"39", align 4
store i32 %"50", ptr %"33", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

ptx/src/test/ll/lg2.ll Normal file (33 lines)

@ -0,0 +1,33 @@
define amdgpu_kernel void @lg2(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
%"32" = alloca i64, align 8, addrspace(5)
%"33" = alloca i64, align 8, addrspace(5)
%"34" = alloca float, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"29"
"29": ; preds = %1
%"35" = load i64, ptr addrspace(4) %"30", align 4
store i64 %"35", ptr addrspace(5) %"32", align 4
%"36" = load i64, ptr addrspace(4) %"31", align 4
store i64 %"36", ptr addrspace(5) %"33", align 4
%"38" = load i64, ptr addrspace(5) %"32", align 4
%"43" = inttoptr i64 %"38" to ptr
%"37" = load float, ptr %"43", align 4
store float %"37", ptr addrspace(5) %"34", align 4
%"40" = load float, ptr addrspace(5) %"34", align 4
%"39" = call float @llvm.amdgcn.log.f32(float %"40")
store float %"39", ptr addrspace(5) %"34", align 4
%"41" = load i64, ptr addrspace(5) %"33", align 4
%"42" = load float, ptr addrspace(5) %"34", align 4
%"44" = inttoptr i64 %"41" to ptr
store float %"42", ptr %"44", align 4
ret void
}
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare float @llvm.amdgcn.log.f32(float) #1
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }


@ -0,0 +1,27 @@
define amdgpu_kernel void @local_align(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
%"10" = alloca [8 x i8], align 8, addrspace(5)
%"33" = alloca i64, align 8, addrspace(5)
%"34" = alloca i64, align 8, addrspace(5)
%"35" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
br label %"30"
"30": ; preds = %1
%"36" = load i64, ptr addrspace(4) %"31", align 4
store i64 %"36", ptr addrspace(5) %"33", align 4
%"37" = load i64, ptr addrspace(4) %"32", align 4
store i64 %"37", ptr addrspace(5) %"34", align 4
%"39" = load i64, ptr addrspace(5) %"33", align 4
%"42" = inttoptr i64 %"39" to ptr
%"38" = load i64, ptr %"42", align 4
store i64 %"38", ptr addrspace(5) %"35", align 4
%"40" = load i64, ptr addrspace(5) %"34", align 4
%"41" = load i64, ptr addrspace(5) %"35", align 4
%"43" = inttoptr i64 %"40" to ptr
store i64 %"41", ptr %"43", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }


@ -0,0 +1,55 @@
define amdgpu_kernel void @mad_s32(ptr addrspace(4) byref(i64) %"41", ptr addrspace(4) byref(i64) %"42") #0 {
%"43" = alloca i64, align 8, addrspace(5)
%"44" = alloca i64, align 8, addrspace(5)
%"45" = alloca i32, align 4, addrspace(5)
%"46" = alloca i32, align 4, addrspace(5)
%"47" = alloca i32, align 4, addrspace(5)
%"48" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"40"
"40": ; preds = %1
%"49" = load i64, ptr addrspace(4) %"41", align 4
store i64 %"49", ptr addrspace(5) %"43", align 4
%"50" = load i64, ptr addrspace(4) %"42", align 4
store i64 %"50", ptr addrspace(5) %"44", align 4
%"52" = load i64, ptr addrspace(5) %"43", align 4
%"67" = inttoptr i64 %"52" to ptr
%"51" = load i32, ptr %"67", align 4
store i32 %"51", ptr addrspace(5) %"46", align 4
%"53" = load i64, ptr addrspace(5) %"43", align 4
%"68" = inttoptr i64 %"53" to ptr
%"33" = getelementptr inbounds i8, ptr %"68", i64 4
%"54" = load i32, ptr %"33", align 4
store i32 %"54", ptr addrspace(5) %"47", align 4
%"55" = load i64, ptr addrspace(5) %"43", align 4
%"69" = inttoptr i64 %"55" to ptr
%"35" = getelementptr inbounds i8, ptr %"69", i64 8
%"56" = load i32, ptr %"35", align 4
store i32 %"56", ptr addrspace(5) %"48", align 4
%"58" = load i32, ptr addrspace(5) %"46", align 4
%"59" = load i32, ptr addrspace(5) %"47", align 4
%"60" = load i32, ptr addrspace(5) %"48", align 4
%2 = mul i32 %"58", %"59"
%"57" = add i32 %2, %"60"
store i32 %"57", ptr addrspace(5) %"45", align 4
%"61" = load i64, ptr addrspace(5) %"44", align 4
%"62" = load i32, ptr addrspace(5) %"45", align 4
%"70" = inttoptr i64 %"61" to ptr
store i32 %"62", ptr %"70", align 4
%"63" = load i64, ptr addrspace(5) %"44", align 4
%"71" = inttoptr i64 %"63" to ptr
%"37" = getelementptr inbounds i8, ptr %"71", i64 4
%"64" = load i32, ptr addrspace(5) %"45", align 4
store i32 %"64", ptr %"37", align 4
%"65" = load i64, ptr addrspace(5) %"44", align 4
%"72" = inttoptr i64 %"65" to ptr
%"39" = getelementptr inbounds i8, ptr %"72", i64 8
%"66" = load i32, ptr addrspace(5) %"45", align 4
store i32 %"66", ptr %"39", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }


@ -0,0 +1,33 @@
define amdgpu_kernel void @malformed_label(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 {
%"36" = alloca i64, align 8, addrspace(5)
%"37" = alloca i64, align 8, addrspace(5)
%"38" = alloca i64, align 8, addrspace(5)
%"39" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
br label %"32"
"32": ; preds = %1
%"40" = load i64, ptr addrspace(4) %"34", align 4
store i64 %"40", ptr addrspace(5) %"36", align 4
%"41" = load i64, ptr addrspace(4) %"35", align 4
store i64 %"41", ptr addrspace(5) %"37", align 4
br label %"10"
"10": ; preds = %"32"
%"43" = load i64, ptr addrspace(5) %"36", align 4
%"48" = inttoptr i64 %"43" to ptr
%"42" = load i64, ptr %"48", align 4
store i64 %"42", ptr addrspace(5) %"38", align 4
%"45" = load i64, ptr addrspace(5) %"38", align 4
%"44" = add i64 %"45", 1
store i64 %"44", ptr addrspace(5) %"39", align 4
%"46" = load i64, ptr addrspace(5) %"37", align 4
%"47" = load i64, ptr addrspace(5) %"39", align 4
%"49" = inttoptr i64 %"46" to ptr
store i64 %"47", ptr %"49", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

ptx/src/test/ll/max.ll Normal file (40 lines)

@ -0,0 +1,40 @@
define amdgpu_kernel void @max(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 {
%"35" = alloca i64, align 8, addrspace(5)
%"36" = alloca i64, align 8, addrspace(5)
%"37" = alloca i32, align 4, addrspace(5)
%"38" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"32"
"32": ; preds = %1
%"39" = load i64, ptr addrspace(4) %"33", align 4
store i64 %"39", ptr addrspace(5) %"35", align 4
%"40" = load i64, ptr addrspace(4) %"34", align 4
store i64 %"40", ptr addrspace(5) %"36", align 4
%"42" = load i64, ptr addrspace(5) %"35", align 4
%"50" = inttoptr i64 %"42" to ptr
%"41" = load i32, ptr %"50", align 4
store i32 %"41", ptr addrspace(5) %"37", align 4
%"43" = load i64, ptr addrspace(5) %"35", align 4
%"51" = inttoptr i64 %"43" to ptr
%"31" = getelementptr inbounds i8, ptr %"51", i64 4
%"44" = load i32, ptr %"31", align 4
store i32 %"44", ptr addrspace(5) %"38", align 4
%"46" = load i32, ptr addrspace(5) %"37", align 4
%"47" = load i32, ptr addrspace(5) %"38", align 4
%"45" = call i32 @llvm.smax.i32(i32 %"46", i32 %"47")
store i32 %"45", ptr addrspace(5) %"37", align 4
%"48" = load i64, ptr addrspace(5) %"36", align 4
%"49" = load i32, ptr addrspace(5) %"37", align 4
%"52" = inttoptr i64 %"48" to ptr
store i32 %"49", ptr %"52", align 4
ret void
}
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.smax.i32(i32, i32) #1
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }

ptx/src/test/ll/membar.ll Normal file (27 lines)

@ -0,0 +1,27 @@
define amdgpu_kernel void @membar(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
%"32" = alloca i64, align 8, addrspace(5)
%"33" = alloca i64, align 8, addrspace(5)
%"34" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"29"
"29": ; preds = %1
%"35" = load i64, ptr addrspace(4) %"30", align 4
store i64 %"35", ptr addrspace(5) %"32", align 4
%"36" = load i64, ptr addrspace(4) %"31", align 4
store i64 %"36", ptr addrspace(5) %"33", align 4
%"38" = load i64, ptr addrspace(5) %"32", align 4
%"42" = inttoptr i64 %"38" to ptr
%"41" = load i32, ptr %"42", align 4
store i32 %"41", ptr addrspace(5) %"34", align 4
fence seq_cst
%"39" = load i64, ptr addrspace(5) %"33", align 4
%"40" = load i32, ptr addrspace(5) %"34", align 4
%"43" = inttoptr i64 %"39" to ptr
store i32 %"40", ptr %"43", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

ptx/src/test/ll/min.ll Normal file (40 lines)

@ -0,0 +1,40 @@
define amdgpu_kernel void @min(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 {
%"35" = alloca i64, align 8, addrspace(5)
%"36" = alloca i64, align 8, addrspace(5)
%"37" = alloca i32, align 4, addrspace(5)
%"38" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"32"
"32": ; preds = %1
%"39" = load i64, ptr addrspace(4) %"33", align 4
store i64 %"39", ptr addrspace(5) %"35", align 4
%"40" = load i64, ptr addrspace(4) %"34", align 4
store i64 %"40", ptr addrspace(5) %"36", align 4
%"42" = load i64, ptr addrspace(5) %"35", align 4
%"50" = inttoptr i64 %"42" to ptr
%"41" = load i32, ptr %"50", align 4
store i32 %"41", ptr addrspace(5) %"37", align 4
%"43" = load i64, ptr addrspace(5) %"35", align 4
%"51" = inttoptr i64 %"43" to ptr
%"31" = getelementptr inbounds i8, ptr %"51", i64 4
%"44" = load i32, ptr %"31", align 4
store i32 %"44", ptr addrspace(5) %"38", align 4
%"46" = load i32, ptr addrspace(5) %"37", align 4
%"47" = load i32, ptr addrspace(5) %"38", align 4
%"45" = call i32 @llvm.smin.i32(i32 %"46", i32 %"47")
store i32 %"45", ptr addrspace(5) %"37", align 4
%"48" = load i64, ptr addrspace(5) %"36", align 4
%"49" = load i32, ptr addrspace(5) %"37", align 4
%"52" = inttoptr i64 %"48" to ptr
store i32 %"49", ptr %"52", align 4
ret void
}
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.smin.i32(i32, i32) #1
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }

ptx/src/test/ll/mov.ll Normal file (29 lines)

@ -0,0 +1,29 @@
define amdgpu_kernel void @mov(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
%"33" = alloca i64, align 8, addrspace(5)
%"34" = alloca i64, align 8, addrspace(5)
%"35" = alloca i64, align 8, addrspace(5)
%"36" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
br label %"30"
"30": ; preds = %1
%"37" = load i64, ptr addrspace(4) %"31", align 4
store i64 %"37", ptr addrspace(5) %"33", align 4
%"38" = load i64, ptr addrspace(4) %"32", align 4
store i64 %"38", ptr addrspace(5) %"34", align 4
%"40" = load i64, ptr addrspace(5) %"33", align 4
%"45" = inttoptr i64 %"40" to ptr
%"39" = load i64, ptr %"45", align 4
store i64 %"39", ptr addrspace(5) %"35", align 4
%"42" = load i64, ptr addrspace(5) %"35", align 4
store i64 %"42", ptr addrspace(5) %"36", align 4
%"43" = load i64, ptr addrspace(5) %"34", align 4
%"44" = load i64, ptr addrspace(5) %"36", align 4
%"46" = inttoptr i64 %"43" to ptr
store i64 %"44", ptr %"46", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }


@ -0,0 +1,15 @@
define amdgpu_kernel void @mov_address(ptr addrspace(4) byref(i64) %"29", ptr addrspace(4) byref(i64) %"30") #0 {
%"10" = alloca [8 x i8], align 1, addrspace(5)
%"31" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
br label %"28"
"28": ; preds = %1
%"33" = ptrtoint ptr addrspace(5) %"10" to i64
store i64 %"33", ptr addrspace(5) %"31", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }


@ -0,0 +1,46 @@
define amdgpu_kernel void @mul24_hi_s32(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) byref(i64) %"33") #0 {
%"34" = alloca i64, align 8, addrspace(5)
%"35" = alloca i64, align 8, addrspace(5)
%"36" = alloca i32, align 4, addrspace(5)
%"37" = alloca i32, align 4, addrspace(5)
%"38" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"31"
"31": ; preds = %1
%"39" = load i64, ptr addrspace(4) %"32", align 4
store i64 %"39", ptr addrspace(5) %"34", align 4
%"40" = load i64, ptr addrspace(4) %"33", align 4
store i64 %"40", ptr addrspace(5) %"35", align 4
%"42" = load i64, ptr addrspace(5) %"34", align 4
%"50" = inttoptr i64 %"42" to ptr
%"41" = load i32, ptr %"50", align 4
store i32 %"41", ptr addrspace(5) %"36", align 4
%"44" = load i32, ptr addrspace(5) %"36", align 4
%"43" = sub i32 0, %"44"
store i32 %"43", ptr addrspace(5) %"37", align 4
%"46" = load i32, ptr addrspace(5) %"37", align 4
%"47" = load i32, ptr addrspace(5) %"36", align 4
%2 = call i32 @llvm.amdgcn.mul.i24(i32 %"46", i32 %"47")
%3 = call i32 @llvm.amdgcn.mulhi.i24(i32 %"46", i32 %"47")
%4 = lshr i32 %2, 16
%5 = shl i32 %3, 16
%"45" = or i32 %4, %5
store i32 %"45", ptr addrspace(5) %"38", align 4
%"48" = load i64, ptr addrspace(5) %"35", align 4
%"49" = load i32, ptr addrspace(5) %"38", align 4
%"51" = inttoptr i64 %"48" to ptr
store i32 %"49", ptr %"51", align 4
ret void
}
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.amdgcn.mul.i24(i32, i32) #1
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.amdgcn.mulhi.i24(i32, i32) #1
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }


@ -0,0 +1,42 @@
define amdgpu_kernel void @mul24_hi_u32(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
%"33" = alloca i64, align 8, addrspace(5)
%"34" = alloca i64, align 8, addrspace(5)
%"35" = alloca i32, align 4, addrspace(5)
%"36" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"30"
"30": ; preds = %1
%"37" = load i64, ptr addrspace(4) %"31", align 4
store i64 %"37", ptr addrspace(5) %"33", align 4
%"38" = load i64, ptr addrspace(4) %"32", align 4
store i64 %"38", ptr addrspace(5) %"34", align 4
%"40" = load i64, ptr addrspace(5) %"33", align 4
%"46" = inttoptr i64 %"40" to ptr
%"39" = load i32, ptr %"46", align 4
store i32 %"39", ptr addrspace(5) %"35", align 4
%"42" = load i32, ptr addrspace(5) %"35", align 4
%"43" = load i32, ptr addrspace(5) %"35", align 4
%2 = call i32 @llvm.amdgcn.mul.u24(i32 %"42", i32 %"43")
%3 = call i32 @llvm.amdgcn.mulhi.u24(i32 %"42", i32 %"43")
%4 = lshr i32 %2, 16
%5 = shl i32 %3, 16
%"41" = or i32 %4, %5
store i32 %"41", ptr addrspace(5) %"36", align 4
%"44" = load i64, ptr addrspace(5) %"34", align 4
%"45" = load i32, ptr addrspace(5) %"36", align 4
%"47" = inttoptr i64 %"44" to ptr
store i32 %"45", ptr %"47", align 4
ret void
}
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.amdgcn.mul.u24(i32, i32) #1
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.amdgcn.mulhi.u24(i32, i32) #1
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }


@ -0,0 +1,39 @@
define amdgpu_kernel void @mul24_lo_s32(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) byref(i64) %"33") #0 {
%"34" = alloca i64, align 8, addrspace(5)
%"35" = alloca i64, align 8, addrspace(5)
%"36" = alloca i32, align 4, addrspace(5)
%"37" = alloca i32, align 4, addrspace(5)
%"38" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"31"
"31": ; preds = %1
%"39" = load i64, ptr addrspace(4) %"32", align 4
store i64 %"39", ptr addrspace(5) %"34", align 4
%"40" = load i64, ptr addrspace(4) %"33", align 4
store i64 %"40", ptr addrspace(5) %"35", align 4
%"42" = load i64, ptr addrspace(5) %"34", align 4
%"50" = inttoptr i64 %"42" to ptr
%"41" = load i32, ptr %"50", align 4
store i32 %"41", ptr addrspace(5) %"36", align 4
%"44" = load i32, ptr addrspace(5) %"36", align 4
%"43" = sub i32 0, %"44"
store i32 %"43", ptr addrspace(5) %"37", align 4
%"46" = load i32, ptr addrspace(5) %"37", align 4
%"47" = load i32, ptr addrspace(5) %"36", align 4
%"45" = call i32 @llvm.amdgcn.mul.i24(i32 %"46", i32 %"47")
store i32 %"45", ptr addrspace(5) %"38", align 4
%"48" = load i64, ptr addrspace(5) %"35", align 4
%"49" = load i32, ptr addrspace(5) %"38", align 4
%"51" = inttoptr i64 %"48" to ptr
store i32 %"49", ptr %"51", align 4
ret void
}
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.amdgcn.mul.i24(i32, i32) #1
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }


@ -0,0 +1,35 @@
define amdgpu_kernel void @mul24_lo_u32(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
%"33" = alloca i64, align 8, addrspace(5)
%"34" = alloca i64, align 8, addrspace(5)
%"35" = alloca i32, align 4, addrspace(5)
%"36" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"30"
"30": ; preds = %1
%"37" = load i64, ptr addrspace(4) %"31", align 4
store i64 %"37", ptr addrspace(5) %"33", align 4
%"38" = load i64, ptr addrspace(4) %"32", align 4
store i64 %"38", ptr addrspace(5) %"34", align 4
%"40" = load i64, ptr addrspace(5) %"33", align 4
%"46" = inttoptr i64 %"40" to ptr
%"39" = load i32, ptr %"46", align 4
store i32 %"39", ptr addrspace(5) %"35", align 4
%"42" = load i32, ptr addrspace(5) %"35", align 4
%"43" = load i32, ptr addrspace(5) %"35", align 4
%"41" = call i32 @llvm.amdgcn.mul.u24(i32 %"42", i32 %"43")
store i32 %"41", ptr addrspace(5) %"36", align 4
%"44" = load i64, ptr addrspace(5) %"34", align 4
%"45" = load i32, ptr addrspace(5) %"36", align 4
%"47" = inttoptr i64 %"44" to ptr
store i32 %"45", ptr %"47", align 4
ret void
}
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.amdgcn.mul.u24(i32, i32) #1
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }


@ -0,0 +1,36 @@
define amdgpu_kernel void @mul_ftz(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 {
%"35" = alloca i64, align 8, addrspace(5)
%"36" = alloca i64, align 8, addrspace(5)
%"37" = alloca float, align 4, addrspace(5)
%"38" = alloca float, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"32"
"32": ; preds = %1
%"39" = load i64, ptr addrspace(4) %"33", align 4
store i64 %"39", ptr addrspace(5) %"35", align 4
%"40" = load i64, ptr addrspace(4) %"34", align 4
store i64 %"40", ptr addrspace(5) %"36", align 4
%"42" = load i64, ptr addrspace(5) %"35", align 4
%"50" = inttoptr i64 %"42" to ptr
%"41" = load float, ptr %"50", align 4
store float %"41", ptr addrspace(5) %"37", align 4
%"43" = load i64, ptr addrspace(5) %"35", align 4
%"51" = inttoptr i64 %"43" to ptr
%"31" = getelementptr inbounds i8, ptr %"51", i64 4
%"44" = load float, ptr %"31", align 4
store float %"44", ptr addrspace(5) %"38", align 4
%"46" = load float, ptr addrspace(5) %"37", align 4
%"47" = load float, ptr addrspace(5) %"38", align 4
%"45" = fmul float %"46", %"47"
store float %"45", ptr addrspace(5) %"37", align 4
%"48" = load i64, ptr addrspace(5) %"36", align 4
%"49" = load float, ptr addrspace(5) %"37", align 4
%"52" = inttoptr i64 %"48" to ptr
store float %"49", ptr %"52", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

ptx/src/test/ll/mul_hi.ll Normal file (33 lines)

@ -0,0 +1,33 @@
define amdgpu_kernel void @mul_hi(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) byref(i64) %"33") #0 {
%"34" = alloca i64, align 8, addrspace(5)
%"35" = alloca i64, align 8, addrspace(5)
%"36" = alloca i64, align 8, addrspace(5)
%"37" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
br label %"31"
"31": ; preds = %1
%"38" = load i64, ptr addrspace(4) %"32", align 4
store i64 %"38", ptr addrspace(5) %"34", align 4
%"39" = load i64, ptr addrspace(4) %"33", align 4
store i64 %"39", ptr addrspace(5) %"35", align 4
%"41" = load i64, ptr addrspace(5) %"34", align 4
%"46" = inttoptr i64 %"41" to ptr
%"40" = load i64, ptr %"46", align 4
store i64 %"40", ptr addrspace(5) %"36", align 4
%"43" = load i64, ptr addrspace(5) %"36", align 4
%2 = zext i64 %"43" to i128
%3 = mul i128 %2, 2
%4 = lshr i128 %3, 64
%"42" = trunc i128 %4 to i64
store i64 %"42", ptr addrspace(5) %"37", align 4
%"44" = load i64, ptr addrspace(5) %"35", align 4
%"45" = load i64, ptr addrspace(5) %"37", align 4
%"47" = inttoptr i64 %"44" to ptr
store i64 %"45", ptr %"47", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

ptx/src/test/ll/mul_lo.ll Normal file (30 lines)

@ -0,0 +1,30 @@
define amdgpu_kernel void @mul_lo(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) byref(i64) %"33") #0 {
%"34" = alloca i64, align 8, addrspace(5)
%"35" = alloca i64, align 8, addrspace(5)
%"36" = alloca i64, align 8, addrspace(5)
%"37" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
br label %"31"
"31": ; preds = %1
%"38" = load i64, ptr addrspace(4) %"32", align 4
store i64 %"38", ptr addrspace(5) %"34", align 4
%"39" = load i64, ptr addrspace(4) %"33", align 4
store i64 %"39", ptr addrspace(5) %"35", align 4
%"41" = load i64, ptr addrspace(5) %"34", align 4
%"46" = inttoptr i64 %"41" to ptr
%"40" = load i64, ptr %"46", align 4
store i64 %"40", ptr addrspace(5) %"36", align 4
%"43" = load i64, ptr addrspace(5) %"36", align 4
%"42" = mul i64 %"43", 2
store i64 %"42", ptr addrspace(5) %"37", align 4
%"44" = load i64, ptr addrspace(5) %"35", align 4
%"45" = load i64, ptr addrspace(5) %"37", align 4
%"47" = inttoptr i64 %"44" to ptr
store i64 %"45", ptr %"47", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }


@ -0,0 +1,36 @@
define amdgpu_kernel void @mul_non_ftz(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 {
%"35" = alloca i64, align 8, addrspace(5)
%"36" = alloca i64, align 8, addrspace(5)
%"37" = alloca float, align 4, addrspace(5)
%"38" = alloca float, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"32"
"32": ; preds = %1
%"39" = load i64, ptr addrspace(4) %"33", align 4
store i64 %"39", ptr addrspace(5) %"35", align 4
%"40" = load i64, ptr addrspace(4) %"34", align 4
store i64 %"40", ptr addrspace(5) %"36", align 4
%"42" = load i64, ptr addrspace(5) %"35", align 4
%"50" = inttoptr i64 %"42" to ptr
%"41" = load float, ptr %"50", align 4
store float %"41", ptr addrspace(5) %"37", align 4
%"43" = load i64, ptr addrspace(5) %"35", align 4
%"51" = inttoptr i64 %"43" to ptr
%"31" = getelementptr inbounds i8, ptr %"51", i64 4
%"44" = load float, ptr %"31", align 4
store float %"44", ptr addrspace(5) %"38", align 4
%"46" = load float, ptr addrspace(5) %"37", align 4
%"47" = load float, ptr addrspace(5) %"38", align 4
%"45" = fmul float %"46", %"47"
store float %"45", ptr addrspace(5) %"37", align 4
%"48" = load i64, ptr addrspace(5) %"36", align 4
%"49" = load float, ptr addrspace(5) %"37", align 4
%"52" = inttoptr i64 %"48" to ptr
store float %"49", ptr %"52", align 4
ret void
}
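; unlike mul_ftz above, this kernel keeps "denormal-fp-math-f32"="ieee", so f32 denormals are not flushed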
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }


@ -0,0 +1,39 @@
define amdgpu_kernel void @mul_wide(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 {
%"36" = alloca i64, align 8, addrspace(5)
%"37" = alloca i64, align 8, addrspace(5)
%"38" = alloca i32, align 4, addrspace(5)
%"39" = alloca i32, align 4, addrspace(5)
%"40" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
br label %"33"
"33": ; preds = %1
%"41" = load i64, ptr addrspace(4) %"34", align 4
store i64 %"41", ptr addrspace(5) %"36", align 4
%"42" = load i64, ptr addrspace(4) %"35", align 4
store i64 %"42", ptr addrspace(5) %"37", align 4
%"44" = load i64, ptr addrspace(5) %"36", align 4
%"52" = inttoptr i64 %"44" to ptr addrspace(1)
%"43" = load i32, ptr addrspace(1) %"52", align 4
store i32 %"43", ptr addrspace(5) %"38", align 4
%"45" = load i64, ptr addrspace(5) %"36", align 4
%"53" = inttoptr i64 %"45" to ptr addrspace(1)
%"32" = getelementptr inbounds i8, ptr addrspace(1) %"53", i64 4
%"46" = load i32, ptr addrspace(1) %"32", align 4
store i32 %"46", ptr addrspace(5) %"39", align 4
%"48" = load i32, ptr addrspace(5) %"38", align 4
%"49" = load i32, ptr addrspace(5) %"39", align 4
%2 = sext i32 %"48" to i64
%3 = sext i32 %"49" to i64
%"47" = mul i64 %2, %3
store i64 %"47", ptr addrspace(5) %"40", align 4
%"50" = load i64, ptr addrspace(5) %"37", align 4
%"51" = load i64, ptr addrspace(5) %"40", align 4
%"54" = inttoptr i64 %"50" to ptr
store i64 %"51", ptr %"54", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

ptx/src/test/ll/neg.ll Normal file (29 lines)

@ -0,0 +1,29 @@
define amdgpu_kernel void @neg(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
%"32" = alloca i64, align 8, addrspace(5)
%"33" = alloca i64, align 8, addrspace(5)
%"34" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"29"
"29": ; preds = %1
%"35" = load i64, ptr addrspace(4) %"30", align 4
store i64 %"35", ptr addrspace(5) %"32", align 4
%"36" = load i64, ptr addrspace(4) %"31", align 4
store i64 %"36", ptr addrspace(5) %"33", align 4
%"38" = load i64, ptr addrspace(5) %"32", align 4
%"43" = inttoptr i64 %"38" to ptr
%"37" = load i32, ptr %"43", align 4
store i32 %"37", ptr addrspace(5) %"34", align 4
%"40" = load i32, ptr addrspace(5) %"34", align 4
%"39" = sub i32 0, %"40"
store i32 %"39", ptr addrspace(5) %"34", align 4
%"41" = load i64, ptr addrspace(5) %"33", align 4
%"42" = load i32, ptr addrspace(5) %"34", align 4
%"44" = inttoptr i64 %"41" to ptr
store i32 %"42", ptr %"44", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }


@ -0,0 +1,35 @@
define amdgpu_kernel void @non_scalar_ptr_offset(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 {
%"36" = alloca i64, align 8, addrspace(5)
%"37" = alloca i64, align 8, addrspace(5)
%"38" = alloca i32, align 4, addrspace(5)
%"39" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"33"
"33": ; preds = %1
%"40" = load i64, ptr addrspace(4) %"34", align 4
store i64 %"40", ptr addrspace(5) %"36", align 4
%"41" = load i64, ptr addrspace(4) %"35", align 4
store i64 %"41", ptr addrspace(5) %"37", align 4
%"42" = load i64, ptr addrspace(5) %"36", align 4
%"50" = inttoptr i64 %"42" to ptr addrspace(1)
%"32" = getelementptr inbounds i8, ptr addrspace(1) %"50", i64 8
%"30" = load <2 x i32>, ptr addrspace(1) %"32", align 8
%"43" = extractelement <2 x i32> %"30", i8 0
%"44" = extractelement <2 x i32> %"30", i8 1
store i32 %"43", ptr addrspace(5) %"38", align 4
store i32 %"44", ptr addrspace(5) %"39", align 4
%"46" = load i32, ptr addrspace(5) %"38", align 4
%"47" = load i32, ptr addrspace(5) %"39", align 4
%"45" = add i32 %"46", %"47"
store i32 %"45", ptr addrspace(5) %"38", align 4
%"48" = load i64, ptr addrspace(5) %"37", align 4
%"49" = load i32, ptr addrspace(5) %"38", align 4
%"51" = inttoptr i64 %"48" to ptr addrspace(1)
store i32 %"49", ptr addrspace(1) %"51", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

ptx/src/test/ll/not.ll Normal file (30 lines)

@ -0,0 +1,30 @@
define amdgpu_kernel void @not(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
%"33" = alloca i64, align 8, addrspace(5)
%"34" = alloca i64, align 8, addrspace(5)
%"35" = alloca i64, align 8, addrspace(5)
%"36" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
br label %"30"
"30": ; preds = %1
%"37" = load i64, ptr addrspace(4) %"31", align 4
store i64 %"37", ptr addrspace(5) %"33", align 4
%"38" = load i64, ptr addrspace(4) %"32", align 4
store i64 %"38", ptr addrspace(5) %"34", align 4
%"40" = load i64, ptr addrspace(5) %"33", align 4
%"45" = inttoptr i64 %"40" to ptr
%"39" = load i64, ptr %"45", align 4
store i64 %"39", ptr addrspace(5) %"35", align 4
%"42" = load i64, ptr addrspace(5) %"35", align 4
%"46" = xor i64 %"42", -1
store i64 %"46", ptr addrspace(5) %"36", align 4
%"43" = load i64, ptr addrspace(5) %"34", align 4
%"44" = load i64, ptr addrspace(5) %"36", align 4
%"48" = inttoptr i64 %"43" to ptr
store i64 %"44", ptr %"48", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

ptx/src/test/ll/ntid.ll Normal file (39 lines)

@ -0,0 +1,39 @@
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
define amdgpu_kernel void @ntid(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #1 {
%"37" = alloca i64, align 8, addrspace(5)
%"38" = alloca i64, align 8, addrspace(5)
%"39" = alloca i32, align 4, addrspace(5)
%"40" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"32"
"32": ; preds = %1
%"41" = load i64, ptr addrspace(4) %"35", align 4
store i64 %"41", ptr addrspace(5) %"37", align 4
%"42" = load i64, ptr addrspace(4) %"36", align 4
store i64 %"42", ptr addrspace(5) %"38", align 4
%"44" = load i64, ptr addrspace(5) %"37", align 4
%"51" = inttoptr i64 %"44" to ptr
%"43" = load i32, ptr %"51", align 4
store i32 %"43", ptr addrspace(5) %"39", align 4
%"31" = call i32 @__zluda_ptx_impl_sreg_ntid(i8 0)
br label %"33"
"33": ; preds = %"32"
store i32 %"31", ptr addrspace(5) %"40", align 4
%"47" = load i32, ptr addrspace(5) %"39", align 4
%"48" = load i32, ptr addrspace(5) %"40", align 4
%"46" = add i32 %"47", %"48"
store i32 %"46", ptr addrspace(5) %"39", align 4
%"49" = load i64, ptr addrspace(5) %"38", align 4
%"50" = load i32, ptr addrspace(5) %"39", align 4
%"52" = inttoptr i64 %"49" to ptr
store i32 %"50", ptr %"52", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

ptx/src/test/ll/or.ll Normal file (36 lines)

@ -0,0 +1,36 @@
define amdgpu_kernel void @or(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 {
%"35" = alloca i64, align 8, addrspace(5)
%"36" = alloca i64, align 8, addrspace(5)
%"37" = alloca i64, align 8, addrspace(5)
%"38" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
br label %"32"
"32": ; preds = %1
%"39" = load i64, ptr addrspace(4) %"33", align 4
store i64 %"39", ptr addrspace(5) %"35", align 4
%"40" = load i64, ptr addrspace(4) %"34", align 4
store i64 %"40", ptr addrspace(5) %"36", align 4
%"42" = load i64, ptr addrspace(5) %"35", align 4
%"50" = inttoptr i64 %"42" to ptr
%"41" = load i64, ptr %"50", align 4
store i64 %"41", ptr addrspace(5) %"37", align 4
%"43" = load i64, ptr addrspace(5) %"35", align 4
%"51" = inttoptr i64 %"43" to ptr
%"31" = getelementptr inbounds i8, ptr %"51", i64 8
%"44" = load i64, ptr %"31", align 4
store i64 %"44", ptr addrspace(5) %"38", align 4
%"46" = load i64, ptr addrspace(5) %"37", align 4
%"47" = load i64, ptr addrspace(5) %"38", align 4
%"52" = or i64 %"46", %"47"
store i64 %"52", ptr addrspace(5) %"37", align 4
%"48" = load i64, ptr addrspace(5) %"36", align 4
%"49" = load i64, ptr addrspace(5) %"37", align 4
%"55" = inttoptr i64 %"48" to ptr
store i64 %"49", ptr %"55", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

ptx/src/test/ll/popc.ll Normal file (33 lines)

@ -0,0 +1,33 @@
define amdgpu_kernel void @popc(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
%"32" = alloca i64, align 8, addrspace(5)
%"33" = alloca i64, align 8, addrspace(5)
%"34" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"29"
"29": ; preds = %1
%"35" = load i64, ptr addrspace(4) %"30", align 4
store i64 %"35", ptr addrspace(5) %"32", align 4
%"36" = load i64, ptr addrspace(4) %"31", align 4
store i64 %"36", ptr addrspace(5) %"33", align 4
%"38" = load i64, ptr addrspace(5) %"32", align 4
%"43" = inttoptr i64 %"38" to ptr
%"37" = load i32, ptr %"43", align 4
store i32 %"37", ptr addrspace(5) %"34", align 4
%"40" = load i32, ptr addrspace(5) %"34", align 4
%"44" = call i32 @llvm.ctpop.i32(i32 %"40")
store i32 %"44", ptr addrspace(5) %"34", align 4
%"41" = load i64, ptr addrspace(5) %"33", align 4
%"42" = load i32, ptr addrspace(5) %"34", align 4
%"45" = inttoptr i64 %"41" to ptr
store i32 %"42", ptr %"45", align 4
ret void
}
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.ctpop.i32(i32) #1
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }


@ -0,0 +1,57 @@
define amdgpu_kernel void @pred_not(ptr addrspace(4) byref(i64) %"41", ptr addrspace(4) byref(i64) %"42") #0 {
%"43" = alloca i64, align 8, addrspace(5)
%"44" = alloca i64, align 8, addrspace(5)
%"45" = alloca i64, align 8, addrspace(5)
%"46" = alloca i64, align 8, addrspace(5)
%"47" = alloca i64, align 8, addrspace(5)
%"48" = alloca i1, align 1, addrspace(5)
br label %1
1: ; preds = %0
br label %"40"
"40": ; preds = %1
%"49" = load i64, ptr addrspace(4) %"41", align 4
store i64 %"49", ptr addrspace(5) %"43", align 4
%"50" = load i64, ptr addrspace(4) %"42", align 4
store i64 %"50", ptr addrspace(5) %"44", align 4
%"52" = load i64, ptr addrspace(5) %"43", align 4
%"66" = inttoptr i64 %"52" to ptr
%"51" = load i64, ptr %"66", align 4
store i64 %"51", ptr addrspace(5) %"45", align 4
%"53" = load i64, ptr addrspace(5) %"43", align 4
%"67" = inttoptr i64 %"53" to ptr
%"37" = getelementptr inbounds i8, ptr %"67", i64 8
%"54" = load i64, ptr %"37", align 4
store i64 %"54", ptr addrspace(5) %"46", align 4
%"56" = load i64, ptr addrspace(5) %"45", align 4
%"57" = load i64, ptr addrspace(5) %"46", align 4
%"55" = icmp ult i64 %"56", %"57"
store i1 %"55", ptr addrspace(5) %"48", align 1
%"59" = load i1, ptr addrspace(5) %"48", align 1
%"58" = xor i1 %"59", true
store i1 %"58", ptr addrspace(5) %"48", align 1
%"60" = load i1, ptr addrspace(5) %"48", align 1
br i1 %"60", label %"16", label %"17"
"16": ; preds = %"40"
store i64 1, ptr addrspace(5) %"47", align 4
br label %"17"
"17": ; preds = %"16", %"40"
%"62" = load i1, ptr addrspace(5) %"48", align 1
br i1 %"62", label %"19", label %"18"
"18": ; preds = %"17"
store i64 2, ptr addrspace(5) %"47", align 4
br label %"19"
"19": ; preds = %"18", %"17"
%"64" = load i64, ptr addrspace(5) %"44", align 4
%"65" = load i64, ptr addrspace(5) %"47", align 4
%"68" = inttoptr i64 %"64" to ptr
store i64 %"65", ptr %"68", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

ptx/src/test/ll/prmt.ll Normal file (38 lines)

@ -0,0 +1,38 @@
define amdgpu_kernel void @prmt(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 {
%"35" = alloca i64, align 8, addrspace(5)
%"36" = alloca i64, align 8, addrspace(5)
%"37" = alloca i32, align 4, addrspace(5)
%"38" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"32"
"32": ; preds = %1
%"39" = load i64, ptr addrspace(4) %"33", align 4
store i64 %"39", ptr addrspace(5) %"35", align 4
%"40" = load i64, ptr addrspace(4) %"34", align 4
store i64 %"40", ptr addrspace(5) %"36", align 4
%"42" = load i64, ptr addrspace(5) %"35", align 4
%"50" = inttoptr i64 %"42" to ptr
%"41" = load i32, ptr %"50", align 4
store i32 %"41", ptr addrspace(5) %"37", align 4
%"43" = load i64, ptr addrspace(5) %"35", align 4
%"51" = inttoptr i64 %"43" to ptr
%"31" = getelementptr inbounds i8, ptr %"51", i64 4
%"44" = load i32, ptr %"31", align 4
store i32 %"44", ptr addrspace(5) %"38", align 4
%"46" = load i32, ptr addrspace(5) %"37", align 4
%"47" = load i32, ptr addrspace(5) %"38", align 4
%2 = bitcast i32 %"46" to <4 x i8>
%3 = bitcast i32 %"47" to <4 x i8>
%"52" = shufflevector <4 x i8> %2, <4 x i8> %3, <4 x i32> <i32 4, i32 0, i32 6, i32 7>
store <4 x i8> %"52", ptr addrspace(5) %"38", align 4
%"48" = load i64, ptr addrspace(5) %"36", align 4
%"49" = load i32, ptr addrspace(5) %"38", align 4
%"55" = inttoptr i64 %"48" to ptr
store i32 %"49", ptr %"55", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

ptx/src/test/ll/rcp.ll Normal file (33 lines)

@ -0,0 +1,33 @@
define amdgpu_kernel void @rcp(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
%"32" = alloca i64, align 8, addrspace(5)
%"33" = alloca i64, align 8, addrspace(5)
%"34" = alloca float, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"29"
"29": ; preds = %1
%"35" = load i64, ptr addrspace(4) %"30", align 4
store i64 %"35", ptr addrspace(5) %"32", align 4
%"36" = load i64, ptr addrspace(4) %"31", align 4
store i64 %"36", ptr addrspace(5) %"33", align 4
%"38" = load i64, ptr addrspace(5) %"32", align 4
%"43" = inttoptr i64 %"38" to ptr
%"37" = load float, ptr %"43", align 4
store float %"37", ptr addrspace(5) %"34", align 4
%"40" = load float, ptr addrspace(5) %"34", align 4
%"39" = call float @llvm.amdgcn.rcp.f32(float %"40")
store float %"39", ptr addrspace(5) %"34", align 4
%"41" = load i64, ptr addrspace(5) %"33", align 4
%"42" = load float, ptr addrspace(5) %"34", align 4
%"44" = inttoptr i64 %"41" to ptr
store float %"42", ptr %"44", align 4
ret void
}
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare float @llvm.amdgcn.rcp.f32(float) #1
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }


@ -0,0 +1,36 @@
define amdgpu_kernel void @reg_local(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 {
%"10" = alloca [8 x i8], align 8, addrspace(5)
%"39" = alloca i64, align 8, addrspace(5)
%"40" = alloca i64, align 8, addrspace(5)
%"41" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
br label %"36"
"36": ; preds = %1
%"42" = load i64, ptr addrspace(4) %"37", align 4
store i64 %"42", ptr addrspace(5) %"39", align 4
%"43" = load i64, ptr addrspace(4) %"38", align 4
store i64 %"43", ptr addrspace(5) %"40", align 4
%"45" = load i64, ptr addrspace(5) %"39", align 4
%"51" = inttoptr i64 %"45" to ptr addrspace(1)
%"50" = load i64, ptr addrspace(1) %"51", align 4
store i64 %"50", ptr addrspace(5) %"41", align 4
%"46" = load i64, ptr addrspace(5) %"41", align 4
%"31" = add i64 %"46", 1
%"52" = addrspacecast ptr addrspace(5) %"10" to ptr
store i64 %"31", ptr %"52", align 4
%"54" = addrspacecast ptr addrspace(5) %"10" to ptr
%"33" = getelementptr inbounds i8, ptr %"54", i64 0
%"55" = load i64, ptr %"33", align 4
store i64 %"55", ptr addrspace(5) %"41", align 4
%"48" = load i64, ptr addrspace(5) %"40", align 4
%"56" = inttoptr i64 %"48" to ptr addrspace(1)
%"35" = getelementptr inbounds i8, ptr addrspace(1) %"56", i64 0
%"49" = load i64, ptr addrspace(5) %"41", align 4
store i64 %"49", ptr addrspace(1) %"35", align 4
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

Some files were not shown because too many files have changed in this diff.