From cc83b9f1f6c4c5667cd60139de946abbee410b5b Mon Sep 17 00:00:00 2001
From: Andrzej Janik
Date: Thu, 1 May 2025 22:37:18 +0200
Subject: [PATCH] Create infrastructure for performance libraries (#363)

---
 .devcontainer/Dockerfile                       |   23 +-
 .devcontainer/devcontainer.json                |    4 +-
 Cargo.lock                                     |   40 +
 Cargo.toml                                     |    5 +
 cuda_base/src/cublas.rs                        | 6861 +++++++++++++++++
 cuda_base/src/cublaslt.rs                      |  581 ++
 cuda_base/src/cuda.rs                          | 1751 ++++-
 cuda_base/src/cudnn8.rs                        | 2579 +++++++
 cuda_base/src/cudnn9.rs                        | 2055 +++++
 cuda_base/src/cufft.rs                         |  368 +
 cuda_base/src/cusparse.rs                      | 5518 +++++++++++++
 cuda_base/src/lib.rs                           |   30 +
 cuda_base/src/nvml.rs                          | 1192 ++-
 cuda_types/src/cublas.rs                       |  324 +
 cuda_types/src/cublaslt.rs                     | 5387 +++++++++++++
 cuda_types/src/cuda.rs                         |  940 ++-
 cuda_types/src/cudnn.rs                        | 1478 ++++
 cuda_types/src/cudnn8.rs                       |  576 ++
 cuda_types/src/cudnn9.rs                       | 2404 ++++++
 cuda_types/src/cufft.rs                        |  427 +
 cuda_types/src/cusparse.rs                     |  550 ++
 cuda_types/src/lib.rs                          |    9 +-
 cuda_types/src/nvml.rs                         | 1088 ++-
 ext/hip_runtime-sys/src/lib.rs                 | 2906 ++++---
 ptx_parser/src/lib.rs                          |    8 +-
 zluda_bindgen/build/cuda_wrapper.h             |    2 +
 .../build/cudnn_v8/cudnn_adv_infer.h           |    1 +
 .../build/cudnn_v8/cudnn_adv_train.h           |    1 +
 zluda_bindgen/build/cudnn_v8/cudnn_backend.h   |    1 +
 .../build/cudnn_v8/cudnn_cnn_infer.h           |    1 +
 .../build/cudnn_v8/cudnn_cnn_train.h           |    1 +
 .../build/cudnn_v8/cudnn_ops_infer.h           |    1 +
 .../build/cudnn_v8/cudnn_ops_train.h           |    1 +
 zluda_bindgen/build/cudnn_v8/cudnn_version.h   |    1 +
 zluda_bindgen/build/cufft_wraper.h             |    2 +
 zluda_bindgen/src/main.rs                      |  598 +-
 zluda_blas/Cargo.toml                          |   17 +
 zluda_blas/src/impl.rs                         |   32 +
 zluda_blas/src/lib.rs                          |   37 +
 zluda_blaslt/Cargo.toml                        |   18 +
 zluda_blaslt/src/impl.rs                       |   42 +
 zluda_blaslt/src/lib.rs                        |   40 +
 zluda_dnn/Cargo.toml                           |   18 +
 zluda_dnn/src/impl.rs                          |   34 +
 zluda_dnn/src/lib.rs                           |   38 +
 zluda_dump/src/format.rs                       |  145 +-
 zluda_dump/src/format_generated.rs             | 1738 ++++-
 zluda_fft/Cargo.toml                           |   17 +
 zluda_fft/src/impl.rs                          |   11 +
 zluda_fft/src/lib.rs                           |   18 +
 zluda_sparse/Cargo.toml                        |   17 +
 zluda_sparse/src/impl.rs                       |   53 +
 zluda_sparse/src/lib.rs                        |   42 +
 53 files changed, 38361 insertions(+), 1670 deletions(-)
 create mode 100644 cuda_base/src/cublas.rs
 create mode 100644 cuda_base/src/cublaslt.rs
 create mode 100644 cuda_base/src/cudnn8.rs
 create mode 100644 cuda_base/src/cudnn9.rs
 create mode 100644 cuda_base/src/cufft.rs
 create mode 100644 cuda_base/src/cusparse.rs
 create mode 100644 cuda_types/src/cublas.rs
 create mode 100644 cuda_types/src/cublaslt.rs
 create mode 100644 cuda_types/src/cudnn.rs
 create mode 100644 cuda_types/src/cudnn8.rs
 create mode 100644 cuda_types/src/cudnn9.rs
 create mode 100644 cuda_types/src/cufft.rs
 create mode 100644 cuda_types/src/cusparse.rs
 create mode 100644 zluda_bindgen/build/cudnn_v8/cudnn_adv_infer.h
 create mode 100644 zluda_bindgen/build/cudnn_v8/cudnn_adv_train.h
 create mode 100644 zluda_bindgen/build/cudnn_v8/cudnn_backend.h
 create mode 100644 zluda_bindgen/build/cudnn_v8/cudnn_cnn_infer.h
 create mode 100644 zluda_bindgen/build/cudnn_v8/cudnn_cnn_train.h
 create mode 100644 zluda_bindgen/build/cudnn_v8/cudnn_ops_infer.h
 create mode 100644 zluda_bindgen/build/cudnn_v8/cudnn_ops_train.h
 create mode 100644 zluda_bindgen/build/cudnn_v8/cudnn_version.h
 create mode 100644 zluda_bindgen/build/cufft_wraper.h
 create mode 100644 zluda_blas/Cargo.toml
 create mode 100644 zluda_blas/src/impl.rs
 create mode 100644 zluda_blas/src/lib.rs
 create mode 100644 zluda_blaslt/Cargo.toml
 create mode 100644 zluda_blaslt/src/impl.rs
 create mode 100644 zluda_blaslt/src/lib.rs
 create mode 100644 zluda_dnn/Cargo.toml
 create mode 100644 zluda_dnn/src/impl.rs
 create mode 100644 zluda_dnn/src/lib.rs
 create mode 100644 zluda_fft/Cargo.toml
 create mode 100644 zluda_fft/src/impl.rs
 create mode 100644 zluda_fft/src/lib.rs
 create mode 100644 zluda_sparse/Cargo.toml
 create mode 100644 zluda_sparse/src/impl.rs
 create mode 100644 zluda_sparse/src/lib.rs
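The patch adds five sibling crates (zluda_blas, zluda_blaslt, zluda_dnn, zluda_fft, zluda_sparse), each with a small lib.rs and impl.rs, as the diffstat above shows. Those stub sources are not reproduced in this excerpt; the following is only a hypothetical sketch of what such a stub pattern can look like, with the simplified types, the status value, and the function body chosen here for illustration rather than taken from zluda_blas/src/impl.rs:

    // Hypothetical sketch of a performance-library stub; NOT the actual
    // contents of zluda_blas/src/impl.rs.
    #![allow(non_camel_case_types)]

    pub type cublasHandle_t = *mut core::ffi::c_void; // opaque handle, simplified

    #[repr(transparent)]
    #[derive(Clone, Copy, PartialEq, Eq, Debug)]
    pub struct cublasStatus_t(pub u32);
    // 15 is CUBLAS_STATUS_NOT_SUPPORTED in cublas_api.h.
    pub const CUBLAS_STATUS_NOT_SUPPORTED: cublasStatus_t = cublasStatus_t(15);

    // Exported under the exact cuBLAS symbol name so an application loading the
    // replacement library resolves it; an unimplemented call then fails loudly
    // with a status code instead of crashing on a missing symbol.
    #[no_mangle]
    pub unsafe extern "system" fn cublasCreate_v2(
        _handle: *mut cublasHandle_t,
    ) -> cublasStatus_t {
        CUBLAS_STATUS_NOT_SUPPORTED
    }

Built as a cdylib, a crate following this pattern can stand in for the corresponding NVIDIA library while real implementations are filled in function by function.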
diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
index efe9d1c..955bf3c 100644
--- a/.devcontainer/Dockerfile
+++ b/.devcontainer/Dockerfile
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:12.4.1-base-ubuntu22.04
+FROM nvidia/cuda:12.8.1-base-ubuntu24.04
 
 RUN DEBIAN_FRONTEND=noninteractive apt-get update -y && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
     wget \
@@ -18,10 +18,14 @@ RUN wget https://apt.llvm.org/llvm.sh && \
     ./llvm.sh ${LLVM_VERSION}
 
 # Feel free to change to a newer version if you have a newer version on your host
-ARG CUDA_PKG_VERSION=12-4
+ARG CUDA_PKG_VERSION=12-8
 # Docker <-> host driver version compatibility is newer host <-> older docker
-# We don't care about a specific driver version, so pick oldest 5XX
-ARG CUDA_DRIVER=515
+# We don't care about a specific driver version, so pick oldest 5XX compatible
+ARG CUDA_DRIVER=570
+RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/libcudnn8_8.9.7.29-1+cuda12.2_amd64.deb && \
+    wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/libcudnn8-dev_8.9.7.29-1+cuda12.2_amd64.deb && \
+    dpkg -i libcudnn8_8.9.7.29-1+cuda12.2_amd64.deb libcudnn8-dev_8.9.7.29-1+cuda12.2_amd64.deb && \
+    rm libcudnn8_8.9.7.29-1+cuda12.2_amd64.deb libcudnn8-dev_8.9.7.29-1+cuda12.2_amd64.deb
 
 RUN DEBIAN_FRONTEND=noninteractive apt-get update -y && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
     # CUDA headers need it for interop
     libgl-dev libegl-dev libvdpau-dev \
@@ -30,13 +34,18 @@ RUN DEBIAN_FRONTEND=noninteractive apt-get update -y && DEBIAN_FRONTEND=noninter
     cuda-nvml-dev-${CUDA_PKG_VERSION} \
     cuda-cudart-${CUDA_PKG_VERSION} \
     cuda-profiler-api-${CUDA_PKG_VERSION} \
-    cuda-nvcc-${CUDA_PKG_VERSION}
+    cuda-nvcc-${CUDA_PKG_VERSION} \
+    libcudnn8-dev \
+    cudnn9-cuda-${CUDA_PKG_VERSION} \
+    libcufft-dev-${CUDA_PKG_VERSION} \
+    libcublas-dev-${CUDA_PKG_VERSION} \
+    libcusparse-dev-${CUDA_PKG_VERSION}
 
-ARG ROCM_VERSION=6.3.1
+ARG ROCM_VERSION=6.4
 RUN mkdir --parents --mode=0755 /etc/apt/keyrings && \
     wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | \
     gpg --dearmor | tee /etc/apt/keyrings/rocm.gpg > /dev/null && \
-    echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/${ROCM_VERSION} jammy main" > /etc/apt/sources.list.d/rocm.list && \
+    echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/${ROCM_VERSION} noble main" > /etc/apt/sources.list.d/rocm.list && \
     echo 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' > /etc/apt/preferences.d/rocm-pin-600 && \
     DEBIAN_FRONTEND=noninteractive apt update -y && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
     rocminfo \
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
index 34e88fb..7c3c934 100644
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -7,7 +7,7 @@
     },
     "securityOpt": [ "seccomp=unconfined" ],
     "runArgs": [
-        "--runtime=nvidia",
+        //"--runtime=nvidia",
         "--device=/dev/kfd",
         "--device=/dev/dri",
         "--group-add=video"
@@ -25,7 +25,7 @@
     },
     // https://aka.ms/dev-containers-non-root.
     "remoteUser": "root",
-    //"hostRequirements": { "gpu": "optional" }
+    "hostRequirements": { "gpu": true },
     "customizations": {
         "vscode": {
             "extensions": [ "mhutchie.git-graph" ]
diff --git a/Cargo.lock b/Cargo.lock
index 5726bb3..1d10122 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1343,6 +1343,30 @@ dependencies = [
  "syn 2.0.89",
 ]
+
+[[package]]
+name = "zluda_blas"
+version = "0.0.0"
+dependencies = [
+ "cuda_base",
+ "cuda_types",
+]
+
+[[package]]
+name = "zluda_blaslt"
+version = "0.0.0"
+dependencies = [
+ "cuda_base",
+ "cuda_types",
+]
+
+[[package]]
+name = "zluda_dnn"
+version = "0.0.0"
+dependencies = [
+ "cuda_base",
+ "cuda_types",
+]
 
 [[package]]
 name = "zluda_dump"
 version = "0.0.0"
@@ -1364,6 +1388,14 @@ dependencies = [
  "winapi",
 ]
 
+[[package]]
+name = "zluda_fft"
+version = "0.0.0"
+dependencies = [
+ "cuda_base",
+ "cuda_types",
+]
+
 [[package]]
 name = "zluda_inject"
 version = "0.0.0"
@@ -1393,3 +1425,11 @@ dependencies = [
  "wchar",
  "winapi",
 ]
+
+[[package]]
+name = "zluda_sparse"
+version = "0.0.0"
+dependencies = [
+ "cuda_base",
+ "cuda_types",
+]
diff --git a/Cargo.toml b/Cargo.toml
index 18fd140..875d36f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -20,6 +20,11 @@ members = [
     "ptx_parser_macros_impl",
     "xtask",
     "zluda_bindgen",
+    "zluda_dnn",
+    "zluda_blas",
+    "zluda_blaslt",
+    "zluda_fft",
+    "zluda_sparse",
 ]
 
 default-members = ["zluda", "zluda_ml", "zluda_inject", "zluda_redirect"]
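Everything below this point is machine-generated FFI surface; the cuda_base/src/cublas.rs file that follows alone declares 6,861 lines of cuBLAS entry points. To make the shape of those declarations concrete, here is a minimal, self-contained sketch that re-declares three of them and drives a real libcublas directly; the simplified handle and status types stand in for the cuda_types definitions, and the library path in the build comment is an assumption:

    // Sketch only: mirrors three declarations from the generated file.
    // Build (Linux; CUDA library path may differ):
    //     rustc demo.rs -L /usr/local/cuda/lib64
    #![allow(non_camel_case_types)]

    pub type cublasHandle_t = *mut core::ffi::c_void; // opaque handle, simplified

    #[repr(transparent)]
    #[derive(Clone, Copy, PartialEq, Eq, Debug)]
    pub struct cublasStatus_t(pub u32); // 0 is CUBLAS_STATUS_SUCCESS

    #[link(name = "cublas")]
    extern "system" {
        fn cublasCreate_v2(handle: *mut cublasHandle_t) -> cublasStatus_t;
        fn cublasGetVersion_v2(
            handle: cublasHandle_t,
            version: *mut core::ffi::c_int,
        ) -> cublasStatus_t;
        fn cublasDestroy_v2(handle: cublasHandle_t) -> cublasStatus_t;
    }

    fn main() {
        unsafe {
            let mut handle = core::ptr::null_mut();
            // Create the library context that every later call threads through.
            assert_eq!(cublasCreate_v2(&mut handle).0, 0);
            let mut version = 0;
            assert_eq!(cublasGetVersion_v2(handle, &mut version).0, 0);
            println!("cuBLAS version: {version}");
            assert_eq!(cublasDestroy_v2(handle).0, 0);
        }
    }

The generated bindings differ from this sketch in that they use the full cuda_types definitions and are consumed by ZLUDA's own build tooling rather than linked straight to NVIDIA's library.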
diff --git a/cuda_base/src/cublas.rs b/cuda_base/src/cublas.rs new file mode 100644 index 0000000..af6702e --- /dev/null +++ b/cuda_base/src/cublas.rs @@ -0,0 +1,6861 @@ +// Generated automatically by zluda_bindgen +// DO NOT EDIT MANUALLY +#![allow(warnings)] +extern "system" { + #[must_use] + fn cublasCreate_v2( + handle: *mut cuda_types::cublas::cublasHandle_t, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDestroy_v2( + handle: cuda_types::cublas::cublasHandle_t, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasGetVersion_v2( + handle: cuda_types::cublas::cublasHandle_t, + version: *mut ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasGetProperty( + type_: cuda_types::cublas::libraryPropertyType, + value: *mut ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + fn cublasGetCudartVersion() -> usize; + #[must_use] + fn cublasSetWorkspace_v2( + handle: cuda_types::cublas::cublasHandle_t, + workspace: *mut ::core::ffi::c_void, + workspaceSizeInBytes: usize, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSetStream_v2( + handle: cuda_types::cublas::cublasHandle_t, + streamId: cuda_types::cublas::cudaStream_t, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasGetStream_v2( + handle: cuda_types::cublas::cublasHandle_t, + streamId: *mut cuda_types::cublas::cudaStream_t, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasGetPointerMode_v2( + handle: cuda_types::cublas::cublasHandle_t, + mode: *mut cuda_types::cublas::cublasPointerMode_t, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSetPointerMode_v2( + handle: cuda_types::cublas::cublasHandle_t, + mode: cuda_types::cublas::cublasPointerMode_t, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasGetAtomicsMode( + handle: cuda_types::cublas::cublasHandle_t, + mode: *mut cuda_types::cublas::cublasAtomicsMode_t, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSetAtomicsMode( + handle:
cuda_types::cublas::cublasHandle_t, + mode: cuda_types::cublas::cublasAtomicsMode_t, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasGetMathMode( + handle: cuda_types::cublas::cublasHandle_t, + mode: *mut cuda_types::cublas::cublasMath_t, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSetMathMode( + handle: cuda_types::cublas::cublasHandle_t, + mode: cuda_types::cublas::cublasMath_t, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasGetSmCountTarget( + handle: cuda_types::cublas::cublasHandle_t, + smCountTarget: *mut ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSetSmCountTarget( + handle: cuda_types::cublas::cublasHandle_t, + smCountTarget: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + fn cublasGetStatusName( + status: cuda_types::cublas::cublasStatus_t, + ) -> *const ::core::ffi::c_char; + fn cublasGetStatusString( + status: cuda_types::cublas::cublasStatus_t, + ) -> *const ::core::ffi::c_char; + #[must_use] + fn cublasLoggerConfigure( + logIsOn: ::core::ffi::c_int, + logToStdOut: ::core::ffi::c_int, + logToStdErr: ::core::ffi::c_int, + logFileName: *const ::core::ffi::c_char, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSetLoggerCallback( + userCallback: cuda_types::cublas::cublasLogCallback, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasGetLoggerCallback( + userCallback: *mut cuda_types::cublas::cublasLogCallback, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSetVector( + n: ::core::ffi::c_int, + elemSize: ::core::ffi::c_int, + x: *const ::core::ffi::c_void, + incx: ::core::ffi::c_int, + devicePtr: *mut ::core::ffi::c_void, + incy: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSetVector_64( + n: i64, + elemSize: i64, + x: *const ::core::ffi::c_void, + incx: i64, + devicePtr: *mut ::core::ffi::c_void, + incy: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasGetVector( + n: ::core::ffi::c_int, + elemSize: ::core::ffi::c_int, + x: *const ::core::ffi::c_void, + incx: ::core::ffi::c_int, + y: *mut ::core::ffi::c_void, + incy: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasGetVector_64( + n: i64, + elemSize: i64, + x: *const ::core::ffi::c_void, + incx: i64, + y: *mut ::core::ffi::c_void, + incy: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSetMatrix( + rows: ::core::ffi::c_int, + cols: ::core::ffi::c_int, + elemSize: ::core::ffi::c_int, + A: *const ::core::ffi::c_void, + lda: ::core::ffi::c_int, + B: *mut ::core::ffi::c_void, + ldb: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSetMatrix_64( + rows: i64, + cols: i64, + elemSize: i64, + A: *const ::core::ffi::c_void, + lda: i64, + B: *mut ::core::ffi::c_void, + ldb: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasGetMatrix( + rows: ::core::ffi::c_int, + cols: ::core::ffi::c_int, + elemSize: ::core::ffi::c_int, + A: *const ::core::ffi::c_void, + lda: ::core::ffi::c_int, + B: *mut ::core::ffi::c_void, + ldb: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasGetMatrix_64( + rows: i64, + cols: i64, + elemSize: i64, + A: *const ::core::ffi::c_void, + lda: i64, + B: *mut ::core::ffi::c_void, + ldb: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSetVectorAsync( + n: ::core::ffi::c_int, + elemSize: 
::core::ffi::c_int, + hostPtr: *const ::core::ffi::c_void, + incx: ::core::ffi::c_int, + devicePtr: *mut ::core::ffi::c_void, + incy: ::core::ffi::c_int, + stream: cuda_types::cublas::cudaStream_t, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSetVectorAsync_64( + n: i64, + elemSize: i64, + hostPtr: *const ::core::ffi::c_void, + incx: i64, + devicePtr: *mut ::core::ffi::c_void, + incy: i64, + stream: cuda_types::cublas::cudaStream_t, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasGetVectorAsync( + n: ::core::ffi::c_int, + elemSize: ::core::ffi::c_int, + devicePtr: *const ::core::ffi::c_void, + incx: ::core::ffi::c_int, + hostPtr: *mut ::core::ffi::c_void, + incy: ::core::ffi::c_int, + stream: cuda_types::cublas::cudaStream_t, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasGetVectorAsync_64( + n: i64, + elemSize: i64, + devicePtr: *const ::core::ffi::c_void, + incx: i64, + hostPtr: *mut ::core::ffi::c_void, + incy: i64, + stream: cuda_types::cublas::cudaStream_t, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSetMatrixAsync( + rows: ::core::ffi::c_int, + cols: ::core::ffi::c_int, + elemSize: ::core::ffi::c_int, + A: *const ::core::ffi::c_void, + lda: ::core::ffi::c_int, + B: *mut ::core::ffi::c_void, + ldb: ::core::ffi::c_int, + stream: cuda_types::cublas::cudaStream_t, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSetMatrixAsync_64( + rows: i64, + cols: i64, + elemSize: i64, + A: *const ::core::ffi::c_void, + lda: i64, + B: *mut ::core::ffi::c_void, + ldb: i64, + stream: cuda_types::cublas::cudaStream_t, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasGetMatrixAsync( + rows: ::core::ffi::c_int, + cols: ::core::ffi::c_int, + elemSize: ::core::ffi::c_int, + A: *const ::core::ffi::c_void, + lda: ::core::ffi::c_int, + B: *mut ::core::ffi::c_void, + ldb: ::core::ffi::c_int, + stream: cuda_types::cublas::cudaStream_t, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasGetMatrixAsync_64( + rows: i64, + cols: i64, + elemSize: i64, + A: *const ::core::ffi::c_void, + lda: i64, + B: *mut ::core::ffi::c_void, + ldb: i64, + stream: cuda_types::cublas::cudaStream_t, + ) -> cuda_types::cublas::cublasStatus_t; + fn cublasXerbla(srName: *const ::core::ffi::c_char, info: ::core::ffi::c_int) -> (); + #[must_use] + fn cublasNrm2Ex( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + x: *const ::core::ffi::c_void, + xType: cuda_types::cublas::cudaDataType, + incx: ::core::ffi::c_int, + result: *mut ::core::ffi::c_void, + resultType: cuda_types::cublas::cudaDataType, + executionType: cuda_types::cublas::cudaDataType, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasNrm2Ex_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + x: *const ::core::ffi::c_void, + xType: cuda_types::cublas::cudaDataType, + incx: i64, + result: *mut ::core::ffi::c_void, + resultType: cuda_types::cublas::cudaDataType, + executionType: cuda_types::cublas::cudaDataType, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSnrm2_v2( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + x: *const f32, + incx: ::core::ffi::c_int, + result: *mut f32, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSnrm2_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + x: *const f32, + incx: i64, + result: *mut f32, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDnrm2_v2( + handle: 
cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + x: *const f64, + incx: ::core::ffi::c_int, + result: *mut f64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDnrm2_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + x: *const f64, + incx: i64, + result: *mut f64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasScnrm2_v2( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + x: *const cuda_types::cublas::cuComplex, + incx: ::core::ffi::c_int, + result: *mut f32, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasScnrm2_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + x: *const cuda_types::cublas::cuComplex, + incx: i64, + result: *mut f32, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDznrm2_v2( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + x: *const cuda_types::cublas::cuDoubleComplex, + incx: ::core::ffi::c_int, + result: *mut f64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDznrm2_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + x: *const cuda_types::cublas::cuDoubleComplex, + incx: i64, + result: *mut f64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDotEx( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + x: *const ::core::ffi::c_void, + xType: cuda_types::cublas::cudaDataType, + incx: ::core::ffi::c_int, + y: *const ::core::ffi::c_void, + yType: cuda_types::cublas::cudaDataType, + incy: ::core::ffi::c_int, + result: *mut ::core::ffi::c_void, + resultType: cuda_types::cublas::cudaDataType, + executionType: cuda_types::cublas::cudaDataType, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDotEx_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + x: *const ::core::ffi::c_void, + xType: cuda_types::cublas::cudaDataType, + incx: i64, + y: *const ::core::ffi::c_void, + yType: cuda_types::cublas::cudaDataType, + incy: i64, + result: *mut ::core::ffi::c_void, + resultType: cuda_types::cublas::cudaDataType, + executionType: cuda_types::cublas::cudaDataType, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDotcEx( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + x: *const ::core::ffi::c_void, + xType: cuda_types::cublas::cudaDataType, + incx: ::core::ffi::c_int, + y: *const ::core::ffi::c_void, + yType: cuda_types::cublas::cudaDataType, + incy: ::core::ffi::c_int, + result: *mut ::core::ffi::c_void, + resultType: cuda_types::cublas::cudaDataType, + executionType: cuda_types::cublas::cudaDataType, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDotcEx_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + x: *const ::core::ffi::c_void, + xType: cuda_types::cublas::cudaDataType, + incx: i64, + y: *const ::core::ffi::c_void, + yType: cuda_types::cublas::cudaDataType, + incy: i64, + result: *mut ::core::ffi::c_void, + resultType: cuda_types::cublas::cudaDataType, + executionType: cuda_types::cublas::cudaDataType, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSdot_v2( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + x: *const f32, + incx: ::core::ffi::c_int, + y: *const f32, + incy: ::core::ffi::c_int, + result: *mut f32, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSdot_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + x: *const f32, + incx: i64, + y: *const f32, + 
incy: i64, + result: *mut f32, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDdot_v2( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + x: *const f64, + incx: ::core::ffi::c_int, + y: *const f64, + incy: ::core::ffi::c_int, + result: *mut f64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDdot_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + x: *const f64, + incx: i64, + y: *const f64, + incy: i64, + result: *mut f64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCdotu_v2( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + x: *const cuda_types::cublas::cuComplex, + incx: ::core::ffi::c_int, + y: *const cuda_types::cublas::cuComplex, + incy: ::core::ffi::c_int, + result: *mut cuda_types::cublas::cuComplex, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCdotu_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + x: *const cuda_types::cublas::cuComplex, + incx: i64, + y: *const cuda_types::cublas::cuComplex, + incy: i64, + result: *mut cuda_types::cublas::cuComplex, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCdotc_v2( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + x: *const cuda_types::cublas::cuComplex, + incx: ::core::ffi::c_int, + y: *const cuda_types::cublas::cuComplex, + incy: ::core::ffi::c_int, + result: *mut cuda_types::cublas::cuComplex, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCdotc_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + x: *const cuda_types::cublas::cuComplex, + incx: i64, + y: *const cuda_types::cublas::cuComplex, + incy: i64, + result: *mut cuda_types::cublas::cuComplex, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZdotu_v2( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + x: *const cuda_types::cublas::cuDoubleComplex, + incx: ::core::ffi::c_int, + y: *const cuda_types::cublas::cuDoubleComplex, + incy: ::core::ffi::c_int, + result: *mut cuda_types::cublas::cuDoubleComplex, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZdotu_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + x: *const cuda_types::cublas::cuDoubleComplex, + incx: i64, + y: *const cuda_types::cublas::cuDoubleComplex, + incy: i64, + result: *mut cuda_types::cublas::cuDoubleComplex, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZdotc_v2( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + x: *const cuda_types::cublas::cuDoubleComplex, + incx: ::core::ffi::c_int, + y: *const cuda_types::cublas::cuDoubleComplex, + incy: ::core::ffi::c_int, + result: *mut cuda_types::cublas::cuDoubleComplex, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZdotc_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + x: *const cuda_types::cublas::cuDoubleComplex, + incx: i64, + y: *const cuda_types::cublas::cuDoubleComplex, + incy: i64, + result: *mut cuda_types::cublas::cuDoubleComplex, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasScalEx( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + alpha: *const ::core::ffi::c_void, + alphaType: cuda_types::cublas::cudaDataType, + x: *mut ::core::ffi::c_void, + xType: cuda_types::cublas::cudaDataType, + incx: ::core::ffi::c_int, + executionType: cuda_types::cublas::cudaDataType, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn 
cublasScalEx_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + alpha: *const ::core::ffi::c_void, + alphaType: cuda_types::cublas::cudaDataType, + x: *mut ::core::ffi::c_void, + xType: cuda_types::cublas::cudaDataType, + incx: i64, + executionType: cuda_types::cublas::cudaDataType, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSscal_v2( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + alpha: *const f32, + x: *mut f32, + incx: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSscal_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + alpha: *const f32, + x: *mut f32, + incx: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDscal_v2( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + alpha: *const f64, + x: *mut f64, + incx: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDscal_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + alpha: *const f64, + x: *mut f64, + incx: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCscal_v2( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuComplex, + x: *mut cuda_types::cublas::cuComplex, + incx: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCscal_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + alpha: *const cuda_types::cublas::cuComplex, + x: *mut cuda_types::cublas::cuComplex, + incx: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCsscal_v2( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + alpha: *const f32, + x: *mut cuda_types::cublas::cuComplex, + incx: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCsscal_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + alpha: *const f32, + x: *mut cuda_types::cublas::cuComplex, + incx: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZscal_v2( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuDoubleComplex, + x: *mut cuda_types::cublas::cuDoubleComplex, + incx: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZscal_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + alpha: *const cuda_types::cublas::cuDoubleComplex, + x: *mut cuda_types::cublas::cuDoubleComplex, + incx: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZdscal_v2( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + alpha: *const f64, + x: *mut cuda_types::cublas::cuDoubleComplex, + incx: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZdscal_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + alpha: *const f64, + x: *mut cuda_types::cublas::cuDoubleComplex, + incx: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasAxpyEx( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + alpha: *const ::core::ffi::c_void, + alphaType: cuda_types::cublas::cudaDataType, + x: *const ::core::ffi::c_void, + xType: cuda_types::cublas::cudaDataType, + incx: ::core::ffi::c_int, + y: *mut ::core::ffi::c_void, + yType: cuda_types::cublas::cudaDataType, + incy: ::core::ffi::c_int, + executiontype: cuda_types::cublas::cudaDataType, + ) -> 
cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasAxpyEx_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + alpha: *const ::core::ffi::c_void, + alphaType: cuda_types::cublas::cudaDataType, + x: *const ::core::ffi::c_void, + xType: cuda_types::cublas::cudaDataType, + incx: i64, + y: *mut ::core::ffi::c_void, + yType: cuda_types::cublas::cudaDataType, + incy: i64, + executiontype: cuda_types::cublas::cudaDataType, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSaxpy_v2( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + alpha: *const f32, + x: *const f32, + incx: ::core::ffi::c_int, + y: *mut f32, + incy: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSaxpy_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + alpha: *const f32, + x: *const f32, + incx: i64, + y: *mut f32, + incy: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDaxpy_v2( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + alpha: *const f64, + x: *const f64, + incx: ::core::ffi::c_int, + y: *mut f64, + incy: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDaxpy_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + alpha: *const f64, + x: *const f64, + incx: i64, + y: *mut f64, + incy: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCaxpy_v2( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuComplex, + x: *const cuda_types::cublas::cuComplex, + incx: ::core::ffi::c_int, + y: *mut cuda_types::cublas::cuComplex, + incy: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCaxpy_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + alpha: *const cuda_types::cublas::cuComplex, + x: *const cuda_types::cublas::cuComplex, + incx: i64, + y: *mut cuda_types::cublas::cuComplex, + incy: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZaxpy_v2( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuDoubleComplex, + x: *const cuda_types::cublas::cuDoubleComplex, + incx: ::core::ffi::c_int, + y: *mut cuda_types::cublas::cuDoubleComplex, + incy: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZaxpy_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + alpha: *const cuda_types::cublas::cuDoubleComplex, + x: *const cuda_types::cublas::cuDoubleComplex, + incx: i64, + y: *mut cuda_types::cublas::cuDoubleComplex, + incy: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCopyEx( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + x: *const ::core::ffi::c_void, + xType: cuda_types::cublas::cudaDataType, + incx: ::core::ffi::c_int, + y: *mut ::core::ffi::c_void, + yType: cuda_types::cublas::cudaDataType, + incy: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCopyEx_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + x: *const ::core::ffi::c_void, + xType: cuda_types::cublas::cudaDataType, + incx: i64, + y: *mut ::core::ffi::c_void, + yType: cuda_types::cublas::cudaDataType, + incy: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasScopy_v2( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + x: *const f32, + incx: ::core::ffi::c_int, + y: *mut 
f32, + incy: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasScopy_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + x: *const f32, + incx: i64, + y: *mut f32, + incy: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDcopy_v2( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + x: *const f64, + incx: ::core::ffi::c_int, + y: *mut f64, + incy: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDcopy_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + x: *const f64, + incx: i64, + y: *mut f64, + incy: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCcopy_v2( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + x: *const cuda_types::cublas::cuComplex, + incx: ::core::ffi::c_int, + y: *mut cuda_types::cublas::cuComplex, + incy: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCcopy_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + x: *const cuda_types::cublas::cuComplex, + incx: i64, + y: *mut cuda_types::cublas::cuComplex, + incy: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZcopy_v2( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + x: *const cuda_types::cublas::cuDoubleComplex, + incx: ::core::ffi::c_int, + y: *mut cuda_types::cublas::cuDoubleComplex, + incy: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZcopy_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + x: *const cuda_types::cublas::cuDoubleComplex, + incx: i64, + y: *mut cuda_types::cublas::cuDoubleComplex, + incy: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSswap_v2( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + x: *mut f32, + incx: ::core::ffi::c_int, + y: *mut f32, + incy: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSswap_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + x: *mut f32, + incx: i64, + y: *mut f32, + incy: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDswap_v2( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + x: *mut f64, + incx: ::core::ffi::c_int, + y: *mut f64, + incy: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDswap_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + x: *mut f64, + incx: i64, + y: *mut f64, + incy: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCswap_v2( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + x: *mut cuda_types::cublas::cuComplex, + incx: ::core::ffi::c_int, + y: *mut cuda_types::cublas::cuComplex, + incy: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCswap_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + x: *mut cuda_types::cublas::cuComplex, + incx: i64, + y: *mut cuda_types::cublas::cuComplex, + incy: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZswap_v2( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + x: *mut cuda_types::cublas::cuDoubleComplex, + incx: ::core::ffi::c_int, + y: *mut cuda_types::cublas::cuDoubleComplex, + incy: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZswap_v2_64( + handle: 
cuda_types::cublas::cublasHandle_t, + n: i64, + x: *mut cuda_types::cublas::cuDoubleComplex, + incx: i64, + y: *mut cuda_types::cublas::cuDoubleComplex, + incy: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSwapEx( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + x: *mut ::core::ffi::c_void, + xType: cuda_types::cublas::cudaDataType, + incx: ::core::ffi::c_int, + y: *mut ::core::ffi::c_void, + yType: cuda_types::cublas::cudaDataType, + incy: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSwapEx_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + x: *mut ::core::ffi::c_void, + xType: cuda_types::cublas::cudaDataType, + incx: i64, + y: *mut ::core::ffi::c_void, + yType: cuda_types::cublas::cudaDataType, + incy: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasIsamax_v2( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + x: *const f32, + incx: ::core::ffi::c_int, + result: *mut ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasIsamax_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + x: *const f32, + incx: i64, + result: *mut i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasIdamax_v2( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + x: *const f64, + incx: ::core::ffi::c_int, + result: *mut ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasIdamax_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + x: *const f64, + incx: i64, + result: *mut i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasIcamax_v2( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + x: *const cuda_types::cublas::cuComplex, + incx: ::core::ffi::c_int, + result: *mut ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasIcamax_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + x: *const cuda_types::cublas::cuComplex, + incx: i64, + result: *mut i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasIzamax_v2( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + x: *const cuda_types::cublas::cuDoubleComplex, + incx: ::core::ffi::c_int, + result: *mut ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasIzamax_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + x: *const cuda_types::cublas::cuDoubleComplex, + incx: i64, + result: *mut i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasIamaxEx( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + x: *const ::core::ffi::c_void, + xType: cuda_types::cublas::cudaDataType, + incx: ::core::ffi::c_int, + result: *mut ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasIamaxEx_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + x: *const ::core::ffi::c_void, + xType: cuda_types::cublas::cudaDataType, + incx: i64, + result: *mut i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasIsamin_v2( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + x: *const f32, + incx: ::core::ffi::c_int, + result: *mut ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasIsamin_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + x: *const f32, + incx: i64, + result: 
*mut i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasIdamin_v2( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + x: *const f64, + incx: ::core::ffi::c_int, + result: *mut ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasIdamin_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + x: *const f64, + incx: i64, + result: *mut i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasIcamin_v2( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + x: *const cuda_types::cublas::cuComplex, + incx: ::core::ffi::c_int, + result: *mut ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasIcamin_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + x: *const cuda_types::cublas::cuComplex, + incx: i64, + result: *mut i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasIzamin_v2( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + x: *const cuda_types::cublas::cuDoubleComplex, + incx: ::core::ffi::c_int, + result: *mut ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasIzamin_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + x: *const cuda_types::cublas::cuDoubleComplex, + incx: i64, + result: *mut i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasIaminEx( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + x: *const ::core::ffi::c_void, + xType: cuda_types::cublas::cudaDataType, + incx: ::core::ffi::c_int, + result: *mut ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasIaminEx_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + x: *const ::core::ffi::c_void, + xType: cuda_types::cublas::cudaDataType, + incx: i64, + result: *mut i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasAsumEx( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + x: *const ::core::ffi::c_void, + xType: cuda_types::cublas::cudaDataType, + incx: ::core::ffi::c_int, + result: *mut ::core::ffi::c_void, + resultType: cuda_types::cublas::cudaDataType, + executiontype: cuda_types::cublas::cudaDataType, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasAsumEx_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + x: *const ::core::ffi::c_void, + xType: cuda_types::cublas::cudaDataType, + incx: i64, + result: *mut ::core::ffi::c_void, + resultType: cuda_types::cublas::cudaDataType, + executiontype: cuda_types::cublas::cudaDataType, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSasum_v2( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + x: *const f32, + incx: ::core::ffi::c_int, + result: *mut f32, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSasum_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + x: *const f32, + incx: i64, + result: *mut f32, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDasum_v2( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + x: *const f64, + incx: ::core::ffi::c_int, + result: *mut f64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDasum_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + x: *const f64, + incx: i64, + result: *mut f64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasScasum_v2( + handle: 
cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + x: *const cuda_types::cublas::cuComplex, + incx: ::core::ffi::c_int, + result: *mut f32, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasScasum_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + x: *const cuda_types::cublas::cuComplex, + incx: i64, + result: *mut f32, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDzasum_v2( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + x: *const cuda_types::cublas::cuDoubleComplex, + incx: ::core::ffi::c_int, + result: *mut f64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDzasum_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + x: *const cuda_types::cublas::cuDoubleComplex, + incx: i64, + result: *mut f64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSrot_v2( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + x: *mut f32, + incx: ::core::ffi::c_int, + y: *mut f32, + incy: ::core::ffi::c_int, + c: *const f32, + s: *const f32, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSrot_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + x: *mut f32, + incx: i64, + y: *mut f32, + incy: i64, + c: *const f32, + s: *const f32, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDrot_v2( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + x: *mut f64, + incx: ::core::ffi::c_int, + y: *mut f64, + incy: ::core::ffi::c_int, + c: *const f64, + s: *const f64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDrot_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + x: *mut f64, + incx: i64, + y: *mut f64, + incy: i64, + c: *const f64, + s: *const f64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCrot_v2( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + x: *mut cuda_types::cublas::cuComplex, + incx: ::core::ffi::c_int, + y: *mut cuda_types::cublas::cuComplex, + incy: ::core::ffi::c_int, + c: *const f32, + s: *const cuda_types::cublas::cuComplex, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCrot_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + x: *mut cuda_types::cublas::cuComplex, + incx: i64, + y: *mut cuda_types::cublas::cuComplex, + incy: i64, + c: *const f32, + s: *const cuda_types::cublas::cuComplex, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCsrot_v2( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + x: *mut cuda_types::cublas::cuComplex, + incx: ::core::ffi::c_int, + y: *mut cuda_types::cublas::cuComplex, + incy: ::core::ffi::c_int, + c: *const f32, + s: *const f32, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCsrot_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + x: *mut cuda_types::cublas::cuComplex, + incx: i64, + y: *mut cuda_types::cublas::cuComplex, + incy: i64, + c: *const f32, + s: *const f32, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZrot_v2( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + x: *mut cuda_types::cublas::cuDoubleComplex, + incx: ::core::ffi::c_int, + y: *mut cuda_types::cublas::cuDoubleComplex, + incy: ::core::ffi::c_int, + c: *const f64, + s: *const cuda_types::cublas::cuDoubleComplex, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZrot_v2_64( + handle: 
cuda_types::cublas::cublasHandle_t, + n: i64, + x: *mut cuda_types::cublas::cuDoubleComplex, + incx: i64, + y: *mut cuda_types::cublas::cuDoubleComplex, + incy: i64, + c: *const f64, + s: *const cuda_types::cublas::cuDoubleComplex, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZdrot_v2( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + x: *mut cuda_types::cublas::cuDoubleComplex, + incx: ::core::ffi::c_int, + y: *mut cuda_types::cublas::cuDoubleComplex, + incy: ::core::ffi::c_int, + c: *const f64, + s: *const f64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZdrot_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + x: *mut cuda_types::cublas::cuDoubleComplex, + incx: i64, + y: *mut cuda_types::cublas::cuDoubleComplex, + incy: i64, + c: *const f64, + s: *const f64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasRotEx( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + x: *mut ::core::ffi::c_void, + xType: cuda_types::cublas::cudaDataType, + incx: ::core::ffi::c_int, + y: *mut ::core::ffi::c_void, + yType: cuda_types::cublas::cudaDataType, + incy: ::core::ffi::c_int, + c: *const ::core::ffi::c_void, + s: *const ::core::ffi::c_void, + csType: cuda_types::cublas::cudaDataType, + executiontype: cuda_types::cublas::cudaDataType, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasRotEx_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + x: *mut ::core::ffi::c_void, + xType: cuda_types::cublas::cudaDataType, + incx: i64, + y: *mut ::core::ffi::c_void, + yType: cuda_types::cublas::cudaDataType, + incy: i64, + c: *const ::core::ffi::c_void, + s: *const ::core::ffi::c_void, + csType: cuda_types::cublas::cudaDataType, + executiontype: cuda_types::cublas::cudaDataType, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSrotg_v2( + handle: cuda_types::cublas::cublasHandle_t, + a: *mut f32, + b: *mut f32, + c: *mut f32, + s: *mut f32, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDrotg_v2( + handle: cuda_types::cublas::cublasHandle_t, + a: *mut f64, + b: *mut f64, + c: *mut f64, + s: *mut f64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCrotg_v2( + handle: cuda_types::cublas::cublasHandle_t, + a: *mut cuda_types::cublas::cuComplex, + b: *mut cuda_types::cublas::cuComplex, + c: *mut f32, + s: *mut cuda_types::cublas::cuComplex, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZrotg_v2( + handle: cuda_types::cublas::cublasHandle_t, + a: *mut cuda_types::cublas::cuDoubleComplex, + b: *mut cuda_types::cublas::cuDoubleComplex, + c: *mut f64, + s: *mut cuda_types::cublas::cuDoubleComplex, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasRotgEx( + handle: cuda_types::cublas::cublasHandle_t, + a: *mut ::core::ffi::c_void, + b: *mut ::core::ffi::c_void, + abType: cuda_types::cublas::cudaDataType, + c: *mut ::core::ffi::c_void, + s: *mut ::core::ffi::c_void, + csType: cuda_types::cublas::cudaDataType, + executiontype: cuda_types::cublas::cudaDataType, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSrotm_v2( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + x: *mut f32, + incx: ::core::ffi::c_int, + y: *mut f32, + incy: ::core::ffi::c_int, + param: *const f32, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSrotm_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + x: *mut f32, + incx: i64, + 
y: *mut f32, + incy: i64, + param: *const f32, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDrotm_v2( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + x: *mut f64, + incx: ::core::ffi::c_int, + y: *mut f64, + incy: ::core::ffi::c_int, + param: *const f64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDrotm_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + x: *mut f64, + incx: i64, + y: *mut f64, + incy: i64, + param: *const f64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasRotmEx( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + x: *mut ::core::ffi::c_void, + xType: cuda_types::cublas::cudaDataType, + incx: ::core::ffi::c_int, + y: *mut ::core::ffi::c_void, + yType: cuda_types::cublas::cudaDataType, + incy: ::core::ffi::c_int, + param: *const ::core::ffi::c_void, + paramType: cuda_types::cublas::cudaDataType, + executiontype: cuda_types::cublas::cudaDataType, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasRotmEx_64( + handle: cuda_types::cublas::cublasHandle_t, + n: i64, + x: *mut ::core::ffi::c_void, + xType: cuda_types::cublas::cudaDataType, + incx: i64, + y: *mut ::core::ffi::c_void, + yType: cuda_types::cublas::cudaDataType, + incy: i64, + param: *const ::core::ffi::c_void, + paramType: cuda_types::cublas::cudaDataType, + executiontype: cuda_types::cublas::cudaDataType, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSrotmg_v2( + handle: cuda_types::cublas::cublasHandle_t, + d1: *mut f32, + d2: *mut f32, + x1: *mut f32, + y1: *const f32, + param: *mut f32, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDrotmg_v2( + handle: cuda_types::cublas::cublasHandle_t, + d1: *mut f64, + d2: *mut f64, + x1: *mut f64, + y1: *const f64, + param: *mut f64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasRotmgEx( + handle: cuda_types::cublas::cublasHandle_t, + d1: *mut ::core::ffi::c_void, + d1Type: cuda_types::cublas::cudaDataType, + d2: *mut ::core::ffi::c_void, + d2Type: cuda_types::cublas::cudaDataType, + x1: *mut ::core::ffi::c_void, + x1Type: cuda_types::cublas::cudaDataType, + y1: *const ::core::ffi::c_void, + y1Type: cuda_types::cublas::cudaDataType, + param: *mut ::core::ffi::c_void, + paramType: cuda_types::cublas::cudaDataType, + executiontype: cuda_types::cublas::cudaDataType, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSgemv_v2( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const f32, + A: *const f32, + lda: ::core::ffi::c_int, + x: *const f32, + incx: ::core::ffi::c_int, + beta: *const f32, + y: *mut f32, + incy: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSgemv_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, + m: i64, + n: i64, + alpha: *const f32, + A: *const f32, + lda: i64, + x: *const f32, + incx: i64, + beta: *const f32, + y: *mut f32, + incy: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDgemv_v2( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const f64, + A: *const f64, + lda: ::core::ffi::c_int, + x: *const f64, + incx: ::core::ffi::c_int, + beta: *const f64, + y: *mut f64, + incy: ::core::ffi::c_int, + ) -> 
cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDgemv_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, + m: i64, + n: i64, + alpha: *const f64, + A: *const f64, + lda: i64, + x: *const f64, + incx: i64, + beta: *const f64, + y: *mut f64, + incy: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCgemv_v2( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuComplex, + A: *const cuda_types::cublas::cuComplex, + lda: ::core::ffi::c_int, + x: *const cuda_types::cublas::cuComplex, + incx: ::core::ffi::c_int, + beta: *const cuda_types::cublas::cuComplex, + y: *mut cuda_types::cublas::cuComplex, + incy: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCgemv_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, + m: i64, + n: i64, + alpha: *const cuda_types::cublas::cuComplex, + A: *const cuda_types::cublas::cuComplex, + lda: i64, + x: *const cuda_types::cublas::cuComplex, + incx: i64, + beta: *const cuda_types::cublas::cuComplex, + y: *mut cuda_types::cublas::cuComplex, + incy: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZgemv_v2( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuDoubleComplex, + A: *const cuda_types::cublas::cuDoubleComplex, + lda: ::core::ffi::c_int, + x: *const cuda_types::cublas::cuDoubleComplex, + incx: ::core::ffi::c_int, + beta: *const cuda_types::cublas::cuDoubleComplex, + y: *mut cuda_types::cublas::cuDoubleComplex, + incy: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZgemv_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, + m: i64, + n: i64, + alpha: *const cuda_types::cublas::cuDoubleComplex, + A: *const cuda_types::cublas::cuDoubleComplex, + lda: i64, + x: *const cuda_types::cublas::cuDoubleComplex, + incx: i64, + beta: *const cuda_types::cublas::cuDoubleComplex, + y: *mut cuda_types::cublas::cuDoubleComplex, + incy: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSgbmv_v2( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + kl: ::core::ffi::c_int, + ku: ::core::ffi::c_int, + alpha: *const f32, + A: *const f32, + lda: ::core::ffi::c_int, + x: *const f32, + incx: ::core::ffi::c_int, + beta: *const f32, + y: *mut f32, + incy: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSgbmv_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, + m: i64, + n: i64, + kl: i64, + ku: i64, + alpha: *const f32, + A: *const f32, + lda: i64, + x: *const f32, + incx: i64, + beta: *const f32, + y: *mut f32, + incy: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDgbmv_v2( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + kl: ::core::ffi::c_int, + ku: ::core::ffi::c_int, + alpha: *const f64, + A: *const f64, + lda: ::core::ffi::c_int, + x: *const f64, + incx: ::core::ffi::c_int, + beta: *const f64, + y: *mut f64, + incy: 
::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDgbmv_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, + m: i64, + n: i64, + kl: i64, + ku: i64, + alpha: *const f64, + A: *const f64, + lda: i64, + x: *const f64, + incx: i64, + beta: *const f64, + y: *mut f64, + incy: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCgbmv_v2( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + kl: ::core::ffi::c_int, + ku: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuComplex, + A: *const cuda_types::cublas::cuComplex, + lda: ::core::ffi::c_int, + x: *const cuda_types::cublas::cuComplex, + incx: ::core::ffi::c_int, + beta: *const cuda_types::cublas::cuComplex, + y: *mut cuda_types::cublas::cuComplex, + incy: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCgbmv_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, + m: i64, + n: i64, + kl: i64, + ku: i64, + alpha: *const cuda_types::cublas::cuComplex, + A: *const cuda_types::cublas::cuComplex, + lda: i64, + x: *const cuda_types::cublas::cuComplex, + incx: i64, + beta: *const cuda_types::cublas::cuComplex, + y: *mut cuda_types::cublas::cuComplex, + incy: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZgbmv_v2( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + kl: ::core::ffi::c_int, + ku: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuDoubleComplex, + A: *const cuda_types::cublas::cuDoubleComplex, + lda: ::core::ffi::c_int, + x: *const cuda_types::cublas::cuDoubleComplex, + incx: ::core::ffi::c_int, + beta: *const cuda_types::cublas::cuDoubleComplex, + y: *mut cuda_types::cublas::cuDoubleComplex, + incy: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZgbmv_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, + m: i64, + n: i64, + kl: i64, + ku: i64, + alpha: *const cuda_types::cublas::cuDoubleComplex, + A: *const cuda_types::cublas::cuDoubleComplex, + lda: i64, + x: *const cuda_types::cublas::cuDoubleComplex, + incx: i64, + beta: *const cuda_types::cublas::cuDoubleComplex, + y: *mut cuda_types::cublas::cuDoubleComplex, + incy: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasStrmv_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + n: ::core::ffi::c_int, + A: *const f32, + lda: ::core::ffi::c_int, + x: *mut f32, + incx: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasStrmv_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + n: i64, + A: *const f32, + lda: i64, + x: *mut f32, + incx: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDtrmv_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + n: ::core::ffi::c_int, + A: *const f64, + lda: 
::core::ffi::c_int, + x: *mut f64, + incx: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDtrmv_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + n: i64, + A: *const f64, + lda: i64, + x: *mut f64, + incx: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCtrmv_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + n: ::core::ffi::c_int, + A: *const cuda_types::cublas::cuComplex, + lda: ::core::ffi::c_int, + x: *mut cuda_types::cublas::cuComplex, + incx: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCtrmv_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + n: i64, + A: *const cuda_types::cublas::cuComplex, + lda: i64, + x: *mut cuda_types::cublas::cuComplex, + incx: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZtrmv_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + n: ::core::ffi::c_int, + A: *const cuda_types::cublas::cuDoubleComplex, + lda: ::core::ffi::c_int, + x: *mut cuda_types::cublas::cuDoubleComplex, + incx: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZtrmv_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + n: i64, + A: *const cuda_types::cublas::cuDoubleComplex, + lda: i64, + x: *mut cuda_types::cublas::cuDoubleComplex, + incx: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasStbmv_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + A: *const f32, + lda: ::core::ffi::c_int, + x: *mut f32, + incx: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasStbmv_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + n: i64, + k: i64, + A: *const f32, + lda: i64, + x: *mut f32, + incx: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDtbmv_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + A: *const f64, + lda: ::core::ffi::c_int, + x: *mut f64, + incx: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDtbmv_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + n: i64, + k: i64, + A: *const f64, + lda: i64, + x: *mut f64, + incx: i64, + ) -> 
cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCtbmv_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + A: *const cuda_types::cublas::cuComplex, + lda: ::core::ffi::c_int, + x: *mut cuda_types::cublas::cuComplex, + incx: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCtbmv_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + n: i64, + k: i64, + A: *const cuda_types::cublas::cuComplex, + lda: i64, + x: *mut cuda_types::cublas::cuComplex, + incx: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZtbmv_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + A: *const cuda_types::cublas::cuDoubleComplex, + lda: ::core::ffi::c_int, + x: *mut cuda_types::cublas::cuDoubleComplex, + incx: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZtbmv_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + n: i64, + k: i64, + A: *const cuda_types::cublas::cuDoubleComplex, + lda: i64, + x: *mut cuda_types::cublas::cuDoubleComplex, + incx: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasStpmv_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + n: ::core::ffi::c_int, + AP: *const f32, + x: *mut f32, + incx: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasStpmv_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + n: i64, + AP: *const f32, + x: *mut f32, + incx: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDtpmv_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + n: ::core::ffi::c_int, + AP: *const f64, + x: *mut f64, + incx: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDtpmv_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + n: i64, + AP: *const f64, + x: *mut f64, + incx: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCtpmv_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + n: ::core::ffi::c_int, + AP: *const cuda_types::cublas::cuComplex, + x: *mut cuda_types::cublas::cuComplex, + incx: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCtpmv_v2_64( + handle: 
cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + n: i64, + AP: *const cuda_types::cublas::cuComplex, + x: *mut cuda_types::cublas::cuComplex, + incx: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZtpmv_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + n: ::core::ffi::c_int, + AP: *const cuda_types::cublas::cuDoubleComplex, + x: *mut cuda_types::cublas::cuDoubleComplex, + incx: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZtpmv_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + n: i64, + AP: *const cuda_types::cublas::cuDoubleComplex, + x: *mut cuda_types::cublas::cuDoubleComplex, + incx: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasStrsv_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + n: ::core::ffi::c_int, + A: *const f32, + lda: ::core::ffi::c_int, + x: *mut f32, + incx: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasStrsv_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + n: i64, + A: *const f32, + lda: i64, + x: *mut f32, + incx: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDtrsv_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + n: ::core::ffi::c_int, + A: *const f64, + lda: ::core::ffi::c_int, + x: *mut f64, + incx: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDtrsv_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + n: i64, + A: *const f64, + lda: i64, + x: *mut f64, + incx: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCtrsv_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + n: ::core::ffi::c_int, + A: *const cuda_types::cublas::cuComplex, + lda: ::core::ffi::c_int, + x: *mut cuda_types::cublas::cuComplex, + incx: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCtrsv_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + n: i64, + A: *const cuda_types::cublas::cuComplex, + lda: i64, + x: *mut cuda_types::cublas::cuComplex, + incx: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZtrsv_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: 
cuda_types::cublas::cublasDiagType_t, + n: ::core::ffi::c_int, + A: *const cuda_types::cublas::cuDoubleComplex, + lda: ::core::ffi::c_int, + x: *mut cuda_types::cublas::cuDoubleComplex, + incx: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZtrsv_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + n: i64, + A: *const cuda_types::cublas::cuDoubleComplex, + lda: i64, + x: *mut cuda_types::cublas::cuDoubleComplex, + incx: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasStpsv_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + n: ::core::ffi::c_int, + AP: *const f32, + x: *mut f32, + incx: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasStpsv_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + n: i64, + AP: *const f32, + x: *mut f32, + incx: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDtpsv_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + n: ::core::ffi::c_int, + AP: *const f64, + x: *mut f64, + incx: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDtpsv_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + n: i64, + AP: *const f64, + x: *mut f64, + incx: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCtpsv_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + n: ::core::ffi::c_int, + AP: *const cuda_types::cublas::cuComplex, + x: *mut cuda_types::cublas::cuComplex, + incx: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCtpsv_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + n: i64, + AP: *const cuda_types::cublas::cuComplex, + x: *mut cuda_types::cublas::cuComplex, + incx: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZtpsv_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + n: ::core::ffi::c_int, + AP: *const cuda_types::cublas::cuDoubleComplex, + x: *mut cuda_types::cublas::cuDoubleComplex, + incx: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZtpsv_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + n: i64, + AP: *const cuda_types::cublas::cuDoubleComplex, + x: *mut cuda_types::cublas::cuDoubleComplex, + incx: i64, + ) -> 
cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasStbsv_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + A: *const f32, + lda: ::core::ffi::c_int, + x: *mut f32, + incx: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasStbsv_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + n: i64, + k: i64, + A: *const f32, + lda: i64, + x: *mut f32, + incx: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDtbsv_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + A: *const f64, + lda: ::core::ffi::c_int, + x: *mut f64, + incx: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDtbsv_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + n: i64, + k: i64, + A: *const f64, + lda: i64, + x: *mut f64, + incx: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCtbsv_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + A: *const cuda_types::cublas::cuComplex, + lda: ::core::ffi::c_int, + x: *mut cuda_types::cublas::cuComplex, + incx: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCtbsv_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + n: i64, + k: i64, + A: *const cuda_types::cublas::cuComplex, + lda: i64, + x: *mut cuda_types::cublas::cuComplex, + incx: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZtbsv_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + A: *const cuda_types::cublas::cuDoubleComplex, + lda: ::core::ffi::c_int, + x: *mut cuda_types::cublas::cuDoubleComplex, + incx: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZtbsv_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + n: i64, + k: i64, + A: *const cuda_types::cublas::cuDoubleComplex, + lda: i64, + x: *mut cuda_types::cublas::cuDoubleComplex, + incx: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSsymv_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: ::core::ffi::c_int, + alpha: *const f32, + A: *const f32, + lda: ::core::ffi::c_int, + x: *const f32, + incx: ::core::ffi::c_int, + beta: *const f32, + y: *mut f32, + incy: 
::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSsymv_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: i64, + alpha: *const f32, + A: *const f32, + lda: i64, + x: *const f32, + incx: i64, + beta: *const f32, + y: *mut f32, + incy: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDsymv_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: ::core::ffi::c_int, + alpha: *const f64, + A: *const f64, + lda: ::core::ffi::c_int, + x: *const f64, + incx: ::core::ffi::c_int, + beta: *const f64, + y: *mut f64, + incy: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDsymv_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: i64, + alpha: *const f64, + A: *const f64, + lda: i64, + x: *const f64, + incx: i64, + beta: *const f64, + y: *mut f64, + incy: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCsymv_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuComplex, + A: *const cuda_types::cublas::cuComplex, + lda: ::core::ffi::c_int, + x: *const cuda_types::cublas::cuComplex, + incx: ::core::ffi::c_int, + beta: *const cuda_types::cublas::cuComplex, + y: *mut cuda_types::cublas::cuComplex, + incy: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCsymv_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: i64, + alpha: *const cuda_types::cublas::cuComplex, + A: *const cuda_types::cublas::cuComplex, + lda: i64, + x: *const cuda_types::cublas::cuComplex, + incx: i64, + beta: *const cuda_types::cublas::cuComplex, + y: *mut cuda_types::cublas::cuComplex, + incy: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZsymv_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuDoubleComplex, + A: *const cuda_types::cublas::cuDoubleComplex, + lda: ::core::ffi::c_int, + x: *const cuda_types::cublas::cuDoubleComplex, + incx: ::core::ffi::c_int, + beta: *const cuda_types::cublas::cuDoubleComplex, + y: *mut cuda_types::cublas::cuDoubleComplex, + incy: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZsymv_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: i64, + alpha: *const cuda_types::cublas::cuDoubleComplex, + A: *const cuda_types::cublas::cuDoubleComplex, + lda: i64, + x: *const cuda_types::cublas::cuDoubleComplex, + incx: i64, + beta: *const cuda_types::cublas::cuDoubleComplex, + y: *mut cuda_types::cublas::cuDoubleComplex, + incy: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasChemv_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuComplex, + A: *const cuda_types::cublas::cuComplex, + lda: ::core::ffi::c_int, + x: *const cuda_types::cublas::cuComplex, + incx: ::core::ffi::c_int, + beta: *const cuda_types::cublas::cuComplex, + y: *mut cuda_types::cublas::cuComplex, + incy: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasChemv_v2_64( + handle: 
cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: i64, + alpha: *const cuda_types::cublas::cuComplex, + A: *const cuda_types::cublas::cuComplex, + lda: i64, + x: *const cuda_types::cublas::cuComplex, + incx: i64, + beta: *const cuda_types::cublas::cuComplex, + y: *mut cuda_types::cublas::cuComplex, + incy: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZhemv_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuDoubleComplex, + A: *const cuda_types::cublas::cuDoubleComplex, + lda: ::core::ffi::c_int, + x: *const cuda_types::cublas::cuDoubleComplex, + incx: ::core::ffi::c_int, + beta: *const cuda_types::cublas::cuDoubleComplex, + y: *mut cuda_types::cublas::cuDoubleComplex, + incy: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZhemv_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: i64, + alpha: *const cuda_types::cublas::cuDoubleComplex, + A: *const cuda_types::cublas::cuDoubleComplex, + lda: i64, + x: *const cuda_types::cublas::cuDoubleComplex, + incx: i64, + beta: *const cuda_types::cublas::cuDoubleComplex, + y: *mut cuda_types::cublas::cuDoubleComplex, + incy: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSsbmv_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + alpha: *const f32, + A: *const f32, + lda: ::core::ffi::c_int, + x: *const f32, + incx: ::core::ffi::c_int, + beta: *const f32, + y: *mut f32, + incy: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSsbmv_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: i64, + k: i64, + alpha: *const f32, + A: *const f32, + lda: i64, + x: *const f32, + incx: i64, + beta: *const f32, + y: *mut f32, + incy: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDsbmv_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + alpha: *const f64, + A: *const f64, + lda: ::core::ffi::c_int, + x: *const f64, + incx: ::core::ffi::c_int, + beta: *const f64, + y: *mut f64, + incy: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDsbmv_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: i64, + k: i64, + alpha: *const f64, + A: *const f64, + lda: i64, + x: *const f64, + incx: i64, + beta: *const f64, + y: *mut f64, + incy: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasChbmv_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuComplex, + A: *const cuda_types::cublas::cuComplex, + lda: ::core::ffi::c_int, + x: *const cuda_types::cublas::cuComplex, + incx: ::core::ffi::c_int, + beta: *const cuda_types::cublas::cuComplex, + y: *mut cuda_types::cublas::cuComplex, + incy: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasChbmv_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: i64, + k: i64, + alpha: *const cuda_types::cublas::cuComplex, + A: *const 
cuda_types::cublas::cuComplex, + lda: i64, + x: *const cuda_types::cublas::cuComplex, + incx: i64, + beta: *const cuda_types::cublas::cuComplex, + y: *mut cuda_types::cublas::cuComplex, + incy: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZhbmv_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuDoubleComplex, + A: *const cuda_types::cublas::cuDoubleComplex, + lda: ::core::ffi::c_int, + x: *const cuda_types::cublas::cuDoubleComplex, + incx: ::core::ffi::c_int, + beta: *const cuda_types::cublas::cuDoubleComplex, + y: *mut cuda_types::cublas::cuDoubleComplex, + incy: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZhbmv_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: i64, + k: i64, + alpha: *const cuda_types::cublas::cuDoubleComplex, + A: *const cuda_types::cublas::cuDoubleComplex, + lda: i64, + x: *const cuda_types::cublas::cuDoubleComplex, + incx: i64, + beta: *const cuda_types::cublas::cuDoubleComplex, + y: *mut cuda_types::cublas::cuDoubleComplex, + incy: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSspmv_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: ::core::ffi::c_int, + alpha: *const f32, + AP: *const f32, + x: *const f32, + incx: ::core::ffi::c_int, + beta: *const f32, + y: *mut f32, + incy: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSspmv_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: i64, + alpha: *const f32, + AP: *const f32, + x: *const f32, + incx: i64, + beta: *const f32, + y: *mut f32, + incy: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDspmv_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: ::core::ffi::c_int, + alpha: *const f64, + AP: *const f64, + x: *const f64, + incx: ::core::ffi::c_int, + beta: *const f64, + y: *mut f64, + incy: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDspmv_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: i64, + alpha: *const f64, + AP: *const f64, + x: *const f64, + incx: i64, + beta: *const f64, + y: *mut f64, + incy: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasChpmv_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuComplex, + AP: *const cuda_types::cublas::cuComplex, + x: *const cuda_types::cublas::cuComplex, + incx: ::core::ffi::c_int, + beta: *const cuda_types::cublas::cuComplex, + y: *mut cuda_types::cublas::cuComplex, + incy: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasChpmv_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: i64, + alpha: *const cuda_types::cublas::cuComplex, + AP: *const cuda_types::cublas::cuComplex, + x: *const cuda_types::cublas::cuComplex, + incx: i64, + beta: *const cuda_types::cublas::cuComplex, + y: *mut cuda_types::cublas::cuComplex, + incy: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZhpmv_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: 
cuda_types::cublas::cublasFillMode_t, + n: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuDoubleComplex, + AP: *const cuda_types::cublas::cuDoubleComplex, + x: *const cuda_types::cublas::cuDoubleComplex, + incx: ::core::ffi::c_int, + beta: *const cuda_types::cublas::cuDoubleComplex, + y: *mut cuda_types::cublas::cuDoubleComplex, + incy: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZhpmv_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: i64, + alpha: *const cuda_types::cublas::cuDoubleComplex, + AP: *const cuda_types::cublas::cuDoubleComplex, + x: *const cuda_types::cublas::cuDoubleComplex, + incx: i64, + beta: *const cuda_types::cublas::cuDoubleComplex, + y: *mut cuda_types::cublas::cuDoubleComplex, + incy: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSger_v2( + handle: cuda_types::cublas::cublasHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const f32, + x: *const f32, + incx: ::core::ffi::c_int, + y: *const f32, + incy: ::core::ffi::c_int, + A: *mut f32, + lda: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSger_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + m: i64, + n: i64, + alpha: *const f32, + x: *const f32, + incx: i64, + y: *const f32, + incy: i64, + A: *mut f32, + lda: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDger_v2( + handle: cuda_types::cublas::cublasHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const f64, + x: *const f64, + incx: ::core::ffi::c_int, + y: *const f64, + incy: ::core::ffi::c_int, + A: *mut f64, + lda: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDger_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + m: i64, + n: i64, + alpha: *const f64, + x: *const f64, + incx: i64, + y: *const f64, + incy: i64, + A: *mut f64, + lda: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCgeru_v2( + handle: cuda_types::cublas::cublasHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuComplex, + x: *const cuda_types::cublas::cuComplex, + incx: ::core::ffi::c_int, + y: *const cuda_types::cublas::cuComplex, + incy: ::core::ffi::c_int, + A: *mut cuda_types::cublas::cuComplex, + lda: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCgeru_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + m: i64, + n: i64, + alpha: *const cuda_types::cublas::cuComplex, + x: *const cuda_types::cublas::cuComplex, + incx: i64, + y: *const cuda_types::cublas::cuComplex, + incy: i64, + A: *mut cuda_types::cublas::cuComplex, + lda: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCgerc_v2( + handle: cuda_types::cublas::cublasHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuComplex, + x: *const cuda_types::cublas::cuComplex, + incx: ::core::ffi::c_int, + y: *const cuda_types::cublas::cuComplex, + incy: ::core::ffi::c_int, + A: *mut cuda_types::cublas::cuComplex, + lda: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCgerc_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + m: i64, + n: i64, + alpha: *const cuda_types::cublas::cuComplex, + x: *const cuda_types::cublas::cuComplex, + incx: i64, + y: *const cuda_types::cublas::cuComplex, + incy: i64, + A: *mut 
cuda_types::cublas::cuComplex, + lda: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZgeru_v2( + handle: cuda_types::cublas::cublasHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuDoubleComplex, + x: *const cuda_types::cublas::cuDoubleComplex, + incx: ::core::ffi::c_int, + y: *const cuda_types::cublas::cuDoubleComplex, + incy: ::core::ffi::c_int, + A: *mut cuda_types::cublas::cuDoubleComplex, + lda: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZgeru_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + m: i64, + n: i64, + alpha: *const cuda_types::cublas::cuDoubleComplex, + x: *const cuda_types::cublas::cuDoubleComplex, + incx: i64, + y: *const cuda_types::cublas::cuDoubleComplex, + incy: i64, + A: *mut cuda_types::cublas::cuDoubleComplex, + lda: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZgerc_v2( + handle: cuda_types::cublas::cublasHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuDoubleComplex, + x: *const cuda_types::cublas::cuDoubleComplex, + incx: ::core::ffi::c_int, + y: *const cuda_types::cublas::cuDoubleComplex, + incy: ::core::ffi::c_int, + A: *mut cuda_types::cublas::cuDoubleComplex, + lda: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZgerc_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + m: i64, + n: i64, + alpha: *const cuda_types::cublas::cuDoubleComplex, + x: *const cuda_types::cublas::cuDoubleComplex, + incx: i64, + y: *const cuda_types::cublas::cuDoubleComplex, + incy: i64, + A: *mut cuda_types::cublas::cuDoubleComplex, + lda: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSsyr_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: ::core::ffi::c_int, + alpha: *const f32, + x: *const f32, + incx: ::core::ffi::c_int, + A: *mut f32, + lda: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSsyr_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: i64, + alpha: *const f32, + x: *const f32, + incx: i64, + A: *mut f32, + lda: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDsyr_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: ::core::ffi::c_int, + alpha: *const f64, + x: *const f64, + incx: ::core::ffi::c_int, + A: *mut f64, + lda: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDsyr_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: i64, + alpha: *const f64, + x: *const f64, + incx: i64, + A: *mut f64, + lda: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCsyr_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuComplex, + x: *const cuda_types::cublas::cuComplex, + incx: ::core::ffi::c_int, + A: *mut cuda_types::cublas::cuComplex, + lda: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCsyr_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: i64, + alpha: *const cuda_types::cublas::cuComplex, + x: *const cuda_types::cublas::cuComplex, + incx: i64, + A: *mut cuda_types::cublas::cuComplex, + 
lda: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZsyr_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuDoubleComplex, + x: *const cuda_types::cublas::cuDoubleComplex, + incx: ::core::ffi::c_int, + A: *mut cuda_types::cublas::cuDoubleComplex, + lda: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZsyr_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: i64, + alpha: *const cuda_types::cublas::cuDoubleComplex, + x: *const cuda_types::cublas::cuDoubleComplex, + incx: i64, + A: *mut cuda_types::cublas::cuDoubleComplex, + lda: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCher_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: ::core::ffi::c_int, + alpha: *const f32, + x: *const cuda_types::cublas::cuComplex, + incx: ::core::ffi::c_int, + A: *mut cuda_types::cublas::cuComplex, + lda: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCher_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: i64, + alpha: *const f32, + x: *const cuda_types::cublas::cuComplex, + incx: i64, + A: *mut cuda_types::cublas::cuComplex, + lda: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZher_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: ::core::ffi::c_int, + alpha: *const f64, + x: *const cuda_types::cublas::cuDoubleComplex, + incx: ::core::ffi::c_int, + A: *mut cuda_types::cublas::cuDoubleComplex, + lda: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZher_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: i64, + alpha: *const f64, + x: *const cuda_types::cublas::cuDoubleComplex, + incx: i64, + A: *mut cuda_types::cublas::cuDoubleComplex, + lda: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSspr_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: ::core::ffi::c_int, + alpha: *const f32, + x: *const f32, + incx: ::core::ffi::c_int, + AP: *mut f32, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSspr_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: i64, + alpha: *const f32, + x: *const f32, + incx: i64, + AP: *mut f32, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDspr_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: ::core::ffi::c_int, + alpha: *const f64, + x: *const f64, + incx: ::core::ffi::c_int, + AP: *mut f64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDspr_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: i64, + alpha: *const f64, + x: *const f64, + incx: i64, + AP: *mut f64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasChpr_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: ::core::ffi::c_int, + alpha: *const f32, + x: *const cuda_types::cublas::cuComplex, + incx: ::core::ffi::c_int, + AP: *mut cuda_types::cublas::cuComplex, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] 
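+ // NOTE (editor's illustrative sketch, not part of the generated bindings):
+ // these externs mirror the cuBLAS C API one-to-one, so a caller drives them
+ // exactly as in C. A minimal, hypothetical rank-1 update through the
+ // 32-bit-index entry points might look like the following, assuming the
+ // usual bindgen-style enum constants, a `cublasCreate_v2` declared earlier
+ // in this file, and device pointers `d_x`/`d_a` already allocated:
+ //
+ //     let mut handle: cuda_types::cublas::cublasHandle_t = std::ptr::null_mut();
+ //     unsafe { cublasCreate_v2(&mut handle) };
+ //     let alpha = 1.0f32;
+ //     // A <- alpha * x * x^T + A; n x n symmetric, lower triangle, column-major
+ //     unsafe {
+ //         cublasSsyr_v2(
+ //             handle,
+ //             cuda_types::cublas::cublasFillMode_t::CUBLAS_FILL_MODE_LOWER,
+ //             n, &alpha, d_x, 1, d_a, n,
+ //         )
+ //     };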
+ fn cublasChpr_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: i64, + alpha: *const f32, + x: *const cuda_types::cublas::cuComplex, + incx: i64, + AP: *mut cuda_types::cublas::cuComplex, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZhpr_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: ::core::ffi::c_int, + alpha: *const f64, + x: *const cuda_types::cublas::cuDoubleComplex, + incx: ::core::ffi::c_int, + AP: *mut cuda_types::cublas::cuDoubleComplex, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZhpr_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: i64, + alpha: *const f64, + x: *const cuda_types::cublas::cuDoubleComplex, + incx: i64, + AP: *mut cuda_types::cublas::cuDoubleComplex, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSsyr2_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: ::core::ffi::c_int, + alpha: *const f32, + x: *const f32, + incx: ::core::ffi::c_int, + y: *const f32, + incy: ::core::ffi::c_int, + A: *mut f32, + lda: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSsyr2_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: i64, + alpha: *const f32, + x: *const f32, + incx: i64, + y: *const f32, + incy: i64, + A: *mut f32, + lda: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDsyr2_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: ::core::ffi::c_int, + alpha: *const f64, + x: *const f64, + incx: ::core::ffi::c_int, + y: *const f64, + incy: ::core::ffi::c_int, + A: *mut f64, + lda: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDsyr2_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: i64, + alpha: *const f64, + x: *const f64, + incx: i64, + y: *const f64, + incy: i64, + A: *mut f64, + lda: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCsyr2_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuComplex, + x: *const cuda_types::cublas::cuComplex, + incx: ::core::ffi::c_int, + y: *const cuda_types::cublas::cuComplex, + incy: ::core::ffi::c_int, + A: *mut cuda_types::cublas::cuComplex, + lda: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCsyr2_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: i64, + alpha: *const cuda_types::cublas::cuComplex, + x: *const cuda_types::cublas::cuComplex, + incx: i64, + y: *const cuda_types::cublas::cuComplex, + incy: i64, + A: *mut cuda_types::cublas::cuComplex, + lda: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZsyr2_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuDoubleComplex, + x: *const cuda_types::cublas::cuDoubleComplex, + incx: ::core::ffi::c_int, + y: *const cuda_types::cublas::cuDoubleComplex, + incy: ::core::ffi::c_int, + A: *mut cuda_types::cublas::cuDoubleComplex, + lda: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] 
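+ // NOTE (editor's illustrative sketch, not part of the generated bindings):
+ // each routine here also has a `_64` twin taking `i64` dimensions and
+ // increments (the 64-bit integer interface added in CUDA 12); the call
+ // shape is otherwise identical, e.g. a hypothetical rank-2 update reusing
+ // the `handle`, `alpha`, and device pointers from the sketch above:
+ //
+ //     unsafe {
+ //         cublasSsyr2_v2_64(
+ //             handle,
+ //             cuda_types::cublas::cublasFillMode_t::CUBLAS_FILL_MODE_LOWER,
+ //             n as i64, &alpha, d_x, 1, d_y, 1, d_a, n as i64,
+ //         )
+ //     };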
+ fn cublasZsyr2_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: i64, + alpha: *const cuda_types::cublas::cuDoubleComplex, + x: *const cuda_types::cublas::cuDoubleComplex, + incx: i64, + y: *const cuda_types::cublas::cuDoubleComplex, + incy: i64, + A: *mut cuda_types::cublas::cuDoubleComplex, + lda: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCher2_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuComplex, + x: *const cuda_types::cublas::cuComplex, + incx: ::core::ffi::c_int, + y: *const cuda_types::cublas::cuComplex, + incy: ::core::ffi::c_int, + A: *mut cuda_types::cublas::cuComplex, + lda: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCher2_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: i64, + alpha: *const cuda_types::cublas::cuComplex, + x: *const cuda_types::cublas::cuComplex, + incx: i64, + y: *const cuda_types::cublas::cuComplex, + incy: i64, + A: *mut cuda_types::cublas::cuComplex, + lda: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZher2_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuDoubleComplex, + x: *const cuda_types::cublas::cuDoubleComplex, + incx: ::core::ffi::c_int, + y: *const cuda_types::cublas::cuDoubleComplex, + incy: ::core::ffi::c_int, + A: *mut cuda_types::cublas::cuDoubleComplex, + lda: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZher2_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: i64, + alpha: *const cuda_types::cublas::cuDoubleComplex, + x: *const cuda_types::cublas::cuDoubleComplex, + incx: i64, + y: *const cuda_types::cublas::cuDoubleComplex, + incy: i64, + A: *mut cuda_types::cublas::cuDoubleComplex, + lda: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSspr2_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: ::core::ffi::c_int, + alpha: *const f32, + x: *const f32, + incx: ::core::ffi::c_int, + y: *const f32, + incy: ::core::ffi::c_int, + AP: *mut f32, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSspr2_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: i64, + alpha: *const f32, + x: *const f32, + incx: i64, + y: *const f32, + incy: i64, + AP: *mut f32, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDspr2_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: ::core::ffi::c_int, + alpha: *const f64, + x: *const f64, + incx: ::core::ffi::c_int, + y: *const f64, + incy: ::core::ffi::c_int, + AP: *mut f64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDspr2_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: i64, + alpha: *const f64, + x: *const f64, + incx: i64, + y: *const f64, + incy: i64, + AP: *mut f64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasChpr2_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuComplex, + x: 
*const cuda_types::cublas::cuComplex, + incx: ::core::ffi::c_int, + y: *const cuda_types::cublas::cuComplex, + incy: ::core::ffi::c_int, + AP: *mut cuda_types::cublas::cuComplex, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasChpr2_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: i64, + alpha: *const cuda_types::cublas::cuComplex, + x: *const cuda_types::cublas::cuComplex, + incx: i64, + y: *const cuda_types::cublas::cuComplex, + incy: i64, + AP: *mut cuda_types::cublas::cuComplex, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZhpr2_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuDoubleComplex, + x: *const cuda_types::cublas::cuDoubleComplex, + incx: ::core::ffi::c_int, + y: *const cuda_types::cublas::cuDoubleComplex, + incy: ::core::ffi::c_int, + AP: *mut cuda_types::cublas::cuDoubleComplex, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZhpr2_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: i64, + alpha: *const cuda_types::cublas::cuDoubleComplex, + x: *const cuda_types::cublas::cuDoubleComplex, + incx: i64, + y: *const cuda_types::cublas::cuDoubleComplex, + incy: i64, + AP: *mut cuda_types::cublas::cuDoubleComplex, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSgemvBatched( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const f32, + Aarray: *const *const f32, + lda: ::core::ffi::c_int, + xarray: *const *const f32, + incx: ::core::ffi::c_int, + beta: *const f32, + yarray: *const *mut f32, + incy: ::core::ffi::c_int, + batchCount: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSgemvBatched_64( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, + m: i64, + n: i64, + alpha: *const f32, + Aarray: *const *const f32, + lda: i64, + xarray: *const *const f32, + incx: i64, + beta: *const f32, + yarray: *const *mut f32, + incy: i64, + batchCount: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDgemvBatched( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const f64, + Aarray: *const *const f64, + lda: ::core::ffi::c_int, + xarray: *const *const f64, + incx: ::core::ffi::c_int, + beta: *const f64, + yarray: *const *mut f64, + incy: ::core::ffi::c_int, + batchCount: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDgemvBatched_64( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, + m: i64, + n: i64, + alpha: *const f64, + Aarray: *const *const f64, + lda: i64, + xarray: *const *const f64, + incx: i64, + beta: *const f64, + yarray: *const *mut f64, + incy: i64, + batchCount: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCgemvBatched( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuComplex, + Aarray: *const *const cuda_types::cublas::cuComplex, + lda: ::core::ffi::c_int, + xarray: *const *const cuda_types::cublas::cuComplex, + incx: 
::core::ffi::c_int, + beta: *const cuda_types::cublas::cuComplex, + yarray: *const *mut cuda_types::cublas::cuComplex, + incy: ::core::ffi::c_int, + batchCount: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCgemvBatched_64( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, + m: i64, + n: i64, + alpha: *const cuda_types::cublas::cuComplex, + Aarray: *const *const cuda_types::cublas::cuComplex, + lda: i64, + xarray: *const *const cuda_types::cublas::cuComplex, + incx: i64, + beta: *const cuda_types::cublas::cuComplex, + yarray: *const *mut cuda_types::cublas::cuComplex, + incy: i64, + batchCount: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZgemvBatched( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuDoubleComplex, + Aarray: *const *const cuda_types::cublas::cuDoubleComplex, + lda: ::core::ffi::c_int, + xarray: *const *const cuda_types::cublas::cuDoubleComplex, + incx: ::core::ffi::c_int, + beta: *const cuda_types::cublas::cuDoubleComplex, + yarray: *const *mut cuda_types::cublas::cuDoubleComplex, + incy: ::core::ffi::c_int, + batchCount: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZgemvBatched_64( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, + m: i64, + n: i64, + alpha: *const cuda_types::cublas::cuDoubleComplex, + Aarray: *const *const cuda_types::cublas::cuDoubleComplex, + lda: i64, + xarray: *const *const cuda_types::cublas::cuDoubleComplex, + incx: i64, + beta: *const cuda_types::cublas::cuDoubleComplex, + yarray: *const *mut cuda_types::cublas::cuDoubleComplex, + incy: i64, + batchCount: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasHSHgemvBatched( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const f32, + Aarray: *const *const cuda_types::cublas::__half, + lda: ::core::ffi::c_int, + xarray: *const *const cuda_types::cublas::__half, + incx: ::core::ffi::c_int, + beta: *const f32, + yarray: *const *mut cuda_types::cublas::__half, + incy: ::core::ffi::c_int, + batchCount: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasHSHgemvBatched_64( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, + m: i64, + n: i64, + alpha: *const f32, + Aarray: *const *const cuda_types::cublas::__half, + lda: i64, + xarray: *const *const cuda_types::cublas::__half, + incx: i64, + beta: *const f32, + yarray: *const *mut cuda_types::cublas::__half, + incy: i64, + batchCount: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasHSSgemvBatched( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const f32, + Aarray: *const *const cuda_types::cublas::__half, + lda: ::core::ffi::c_int, + xarray: *const *const cuda_types::cublas::__half, + incx: ::core::ffi::c_int, + beta: *const f32, + yarray: *const *mut f32, + incy: ::core::ffi::c_int, + batchCount: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasHSSgemvBatched_64( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, 
+ m: i64, + n: i64, + alpha: *const f32, + Aarray: *const *const cuda_types::cublas::__half, + lda: i64, + xarray: *const *const cuda_types::cublas::__half, + incx: i64, + beta: *const f32, + yarray: *const *mut f32, + incy: i64, + batchCount: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasTSTgemvBatched( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const f32, + Aarray: *const *const cuda_types::cublas::__nv_bfloat16, + lda: ::core::ffi::c_int, + xarray: *const *const cuda_types::cublas::__nv_bfloat16, + incx: ::core::ffi::c_int, + beta: *const f32, + yarray: *const *mut cuda_types::cublas::__nv_bfloat16, + incy: ::core::ffi::c_int, + batchCount: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasTSTgemvBatched_64( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, + m: i64, + n: i64, + alpha: *const f32, + Aarray: *const *const cuda_types::cublas::__nv_bfloat16, + lda: i64, + xarray: *const *const cuda_types::cublas::__nv_bfloat16, + incx: i64, + beta: *const f32, + yarray: *const *mut cuda_types::cublas::__nv_bfloat16, + incy: i64, + batchCount: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasTSSgemvBatched( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const f32, + Aarray: *const *const cuda_types::cublas::__nv_bfloat16, + lda: ::core::ffi::c_int, + xarray: *const *const cuda_types::cublas::__nv_bfloat16, + incx: ::core::ffi::c_int, + beta: *const f32, + yarray: *const *mut f32, + incy: ::core::ffi::c_int, + batchCount: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasTSSgemvBatched_64( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, + m: i64, + n: i64, + alpha: *const f32, + Aarray: *const *const cuda_types::cublas::__nv_bfloat16, + lda: i64, + xarray: *const *const cuda_types::cublas::__nv_bfloat16, + incx: i64, + beta: *const f32, + yarray: *const *mut f32, + incy: i64, + batchCount: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSgemvStridedBatched( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const f32, + A: *const f32, + lda: ::core::ffi::c_int, + strideA: ::core::ffi::c_longlong, + x: *const f32, + incx: ::core::ffi::c_int, + stridex: ::core::ffi::c_longlong, + beta: *const f32, + y: *mut f32, + incy: ::core::ffi::c_int, + stridey: ::core::ffi::c_longlong, + batchCount: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSgemvStridedBatched_64( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, + m: i64, + n: i64, + alpha: *const f32, + A: *const f32, + lda: i64, + strideA: ::core::ffi::c_longlong, + x: *const f32, + incx: i64, + stridex: ::core::ffi::c_longlong, + beta: *const f32, + y: *mut f32, + incy: i64, + stridey: ::core::ffi::c_longlong, + batchCount: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDgemvStridedBatched( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const f64, + A: *const f64, + lda: 
::core::ffi::c_int, + strideA: ::core::ffi::c_longlong, + x: *const f64, + incx: ::core::ffi::c_int, + stridex: ::core::ffi::c_longlong, + beta: *const f64, + y: *mut f64, + incy: ::core::ffi::c_int, + stridey: ::core::ffi::c_longlong, + batchCount: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDgemvStridedBatched_64( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, + m: i64, + n: i64, + alpha: *const f64, + A: *const f64, + lda: i64, + strideA: ::core::ffi::c_longlong, + x: *const f64, + incx: i64, + stridex: ::core::ffi::c_longlong, + beta: *const f64, + y: *mut f64, + incy: i64, + stridey: ::core::ffi::c_longlong, + batchCount: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCgemvStridedBatched( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuComplex, + A: *const cuda_types::cublas::cuComplex, + lda: ::core::ffi::c_int, + strideA: ::core::ffi::c_longlong, + x: *const cuda_types::cublas::cuComplex, + incx: ::core::ffi::c_int, + stridex: ::core::ffi::c_longlong, + beta: *const cuda_types::cublas::cuComplex, + y: *mut cuda_types::cublas::cuComplex, + incy: ::core::ffi::c_int, + stridey: ::core::ffi::c_longlong, + batchCount: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCgemvStridedBatched_64( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, + m: i64, + n: i64, + alpha: *const cuda_types::cublas::cuComplex, + A: *const cuda_types::cublas::cuComplex, + lda: i64, + strideA: ::core::ffi::c_longlong, + x: *const cuda_types::cublas::cuComplex, + incx: i64, + stridex: ::core::ffi::c_longlong, + beta: *const cuda_types::cublas::cuComplex, + y: *mut cuda_types::cublas::cuComplex, + incy: i64, + stridey: ::core::ffi::c_longlong, + batchCount: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZgemvStridedBatched( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuDoubleComplex, + A: *const cuda_types::cublas::cuDoubleComplex, + lda: ::core::ffi::c_int, + strideA: ::core::ffi::c_longlong, + x: *const cuda_types::cublas::cuDoubleComplex, + incx: ::core::ffi::c_int, + stridex: ::core::ffi::c_longlong, + beta: *const cuda_types::cublas::cuDoubleComplex, + y: *mut cuda_types::cublas::cuDoubleComplex, + incy: ::core::ffi::c_int, + stridey: ::core::ffi::c_longlong, + batchCount: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZgemvStridedBatched_64( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, + m: i64, + n: i64, + alpha: *const cuda_types::cublas::cuDoubleComplex, + A: *const cuda_types::cublas::cuDoubleComplex, + lda: i64, + strideA: ::core::ffi::c_longlong, + x: *const cuda_types::cublas::cuDoubleComplex, + incx: i64, + stridex: ::core::ffi::c_longlong, + beta: *const cuda_types::cublas::cuDoubleComplex, + y: *mut cuda_types::cublas::cuDoubleComplex, + incy: i64, + stridey: ::core::ffi::c_longlong, + batchCount: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasHSHgemvStridedBatched( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: 
::core::ffi::c_int, + alpha: *const f32, + A: *const cuda_types::cublas::__half, + lda: ::core::ffi::c_int, + strideA: ::core::ffi::c_longlong, + x: *const cuda_types::cublas::__half, + incx: ::core::ffi::c_int, + stridex: ::core::ffi::c_longlong, + beta: *const f32, + y: *mut cuda_types::cublas::__half, + incy: ::core::ffi::c_int, + stridey: ::core::ffi::c_longlong, + batchCount: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasHSHgemvStridedBatched_64( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, + m: i64, + n: i64, + alpha: *const f32, + A: *const cuda_types::cublas::__half, + lda: i64, + strideA: ::core::ffi::c_longlong, + x: *const cuda_types::cublas::__half, + incx: i64, + stridex: ::core::ffi::c_longlong, + beta: *const f32, + y: *mut cuda_types::cublas::__half, + incy: i64, + stridey: ::core::ffi::c_longlong, + batchCount: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasHSSgemvStridedBatched( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const f32, + A: *const cuda_types::cublas::__half, + lda: ::core::ffi::c_int, + strideA: ::core::ffi::c_longlong, + x: *const cuda_types::cublas::__half, + incx: ::core::ffi::c_int, + stridex: ::core::ffi::c_longlong, + beta: *const f32, + y: *mut f32, + incy: ::core::ffi::c_int, + stridey: ::core::ffi::c_longlong, + batchCount: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasHSSgemvStridedBatched_64( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, + m: i64, + n: i64, + alpha: *const f32, + A: *const cuda_types::cublas::__half, + lda: i64, + strideA: ::core::ffi::c_longlong, + x: *const cuda_types::cublas::__half, + incx: i64, + stridex: ::core::ffi::c_longlong, + beta: *const f32, + y: *mut f32, + incy: i64, + stridey: ::core::ffi::c_longlong, + batchCount: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasTSTgemvStridedBatched( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const f32, + A: *const cuda_types::cublas::__nv_bfloat16, + lda: ::core::ffi::c_int, + strideA: ::core::ffi::c_longlong, + x: *const cuda_types::cublas::__nv_bfloat16, + incx: ::core::ffi::c_int, + stridex: ::core::ffi::c_longlong, + beta: *const f32, + y: *mut cuda_types::cublas::__nv_bfloat16, + incy: ::core::ffi::c_int, + stridey: ::core::ffi::c_longlong, + batchCount: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasTSTgemvStridedBatched_64( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, + m: i64, + n: i64, + alpha: *const f32, + A: *const cuda_types::cublas::__nv_bfloat16, + lda: i64, + strideA: ::core::ffi::c_longlong, + x: *const cuda_types::cublas::__nv_bfloat16, + incx: i64, + stridex: ::core::ffi::c_longlong, + beta: *const f32, + y: *mut cuda_types::cublas::__nv_bfloat16, + incy: i64, + stridey: ::core::ffi::c_longlong, + batchCount: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasTSSgemvStridedBatched( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const f32, + A: *const cuda_types::cublas::__nv_bfloat16, + lda: 
::core::ffi::c_int, + strideA: ::core::ffi::c_longlong, + x: *const cuda_types::cublas::__nv_bfloat16, + incx: ::core::ffi::c_int, + stridex: ::core::ffi::c_longlong, + beta: *const f32, + y: *mut f32, + incy: ::core::ffi::c_int, + stridey: ::core::ffi::c_longlong, + batchCount: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasTSSgemvStridedBatched_64( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, + m: i64, + n: i64, + alpha: *const f32, + A: *const cuda_types::cublas::__nv_bfloat16, + lda: i64, + strideA: ::core::ffi::c_longlong, + x: *const cuda_types::cublas::__nv_bfloat16, + incx: i64, + stridex: ::core::ffi::c_longlong, + beta: *const f32, + y: *mut f32, + incy: i64, + stridey: ::core::ffi::c_longlong, + batchCount: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSgemm_v2( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + alpha: *const f32, + A: *const f32, + lda: ::core::ffi::c_int, + B: *const f32, + ldb: ::core::ffi::c_int, + beta: *const f32, + C: *mut f32, + ldc: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSgemm_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: i64, + n: i64, + k: i64, + alpha: *const f32, + A: *const f32, + lda: i64, + B: *const f32, + ldb: i64, + beta: *const f32, + C: *mut f32, + ldc: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDgemm_v2( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + alpha: *const f64, + A: *const f64, + lda: ::core::ffi::c_int, + B: *const f64, + ldb: ::core::ffi::c_int, + beta: *const f64, + C: *mut f64, + ldc: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDgemm_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: i64, + n: i64, + k: i64, + alpha: *const f64, + A: *const f64, + lda: i64, + B: *const f64, + ldb: i64, + beta: *const f64, + C: *mut f64, + ldc: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCgemm_v2( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuComplex, + A: *const cuda_types::cublas::cuComplex, + lda: ::core::ffi::c_int, + B: *const cuda_types::cublas::cuComplex, + ldb: ::core::ffi::c_int, + beta: *const cuda_types::cublas::cuComplex, + C: *mut cuda_types::cublas::cuComplex, + ldc: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCgemm_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: i64, + n: i64, + k: i64, + alpha: *const cuda_types::cublas::cuComplex, + A: *const cuda_types::cublas::cuComplex, + lda: i64, + B: *const cuda_types::cublas::cuComplex, + ldb: i64, + beta: *const 
cuda_types::cublas::cuComplex, + C: *mut cuda_types::cublas::cuComplex, + ldc: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCgemm3m( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuComplex, + A: *const cuda_types::cublas::cuComplex, + lda: ::core::ffi::c_int, + B: *const cuda_types::cublas::cuComplex, + ldb: ::core::ffi::c_int, + beta: *const cuda_types::cublas::cuComplex, + C: *mut cuda_types::cublas::cuComplex, + ldc: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCgemm3m_64( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: i64, + n: i64, + k: i64, + alpha: *const cuda_types::cublas::cuComplex, + A: *const cuda_types::cublas::cuComplex, + lda: i64, + B: *const cuda_types::cublas::cuComplex, + ldb: i64, + beta: *const cuda_types::cublas::cuComplex, + C: *mut cuda_types::cublas::cuComplex, + ldc: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCgemm3mEx( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuComplex, + A: *const ::core::ffi::c_void, + Atype: cuda_types::cublas::cudaDataType, + lda: ::core::ffi::c_int, + B: *const ::core::ffi::c_void, + Btype: cuda_types::cublas::cudaDataType, + ldb: ::core::ffi::c_int, + beta: *const cuda_types::cublas::cuComplex, + C: *mut ::core::ffi::c_void, + Ctype: cuda_types::cublas::cudaDataType, + ldc: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCgemm3mEx_64( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: i64, + n: i64, + k: i64, + alpha: *const cuda_types::cublas::cuComplex, + A: *const ::core::ffi::c_void, + Atype: cuda_types::cublas::cudaDataType, + lda: i64, + B: *const ::core::ffi::c_void, + Btype: cuda_types::cublas::cudaDataType, + ldb: i64, + beta: *const cuda_types::cublas::cuComplex, + C: *mut ::core::ffi::c_void, + Ctype: cuda_types::cublas::cudaDataType, + ldc: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZgemm_v2( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuDoubleComplex, + A: *const cuda_types::cublas::cuDoubleComplex, + lda: ::core::ffi::c_int, + B: *const cuda_types::cublas::cuDoubleComplex, + ldb: ::core::ffi::c_int, + beta: *const cuda_types::cublas::cuDoubleComplex, + C: *mut cuda_types::cublas::cuDoubleComplex, + ldc: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZgemm_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: i64, + n: i64, + k: i64, + alpha: *const cuda_types::cublas::cuDoubleComplex, + A: *const cuda_types::cublas::cuDoubleComplex, + lda: i64, + B: *const cuda_types::cublas::cuDoubleComplex, + ldb: 
i64, + beta: *const cuda_types::cublas::cuDoubleComplex, + C: *mut cuda_types::cublas::cuDoubleComplex, + ldc: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZgemm3m( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuDoubleComplex, + A: *const cuda_types::cublas::cuDoubleComplex, + lda: ::core::ffi::c_int, + B: *const cuda_types::cublas::cuDoubleComplex, + ldb: ::core::ffi::c_int, + beta: *const cuda_types::cublas::cuDoubleComplex, + C: *mut cuda_types::cublas::cuDoubleComplex, + ldc: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZgemm3m_64( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: i64, + n: i64, + k: i64, + alpha: *const cuda_types::cublas::cuDoubleComplex, + A: *const cuda_types::cublas::cuDoubleComplex, + lda: i64, + B: *const cuda_types::cublas::cuDoubleComplex, + ldb: i64, + beta: *const cuda_types::cublas::cuDoubleComplex, + C: *mut cuda_types::cublas::cuDoubleComplex, + ldc: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasHgemm( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::__half, + A: *const cuda_types::cublas::__half, + lda: ::core::ffi::c_int, + B: *const cuda_types::cublas::__half, + ldb: ::core::ffi::c_int, + beta: *const cuda_types::cublas::__half, + C: *mut cuda_types::cublas::__half, + ldc: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasHgemm_64( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: i64, + n: i64, + k: i64, + alpha: *const cuda_types::cublas::__half, + A: *const cuda_types::cublas::__half, + lda: i64, + B: *const cuda_types::cublas::__half, + ldb: i64, + beta: *const cuda_types::cublas::__half, + C: *mut cuda_types::cublas::__half, + ldc: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSgemmEx( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + alpha: *const f32, + A: *const ::core::ffi::c_void, + Atype: cuda_types::cublas::cudaDataType, + lda: ::core::ffi::c_int, + B: *const ::core::ffi::c_void, + Btype: cuda_types::cublas::cudaDataType, + ldb: ::core::ffi::c_int, + beta: *const f32, + C: *mut ::core::ffi::c_void, + Ctype: cuda_types::cublas::cudaDataType, + ldc: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSgemmEx_64( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: i64, + n: i64, + k: i64, + alpha: *const f32, + A: *const ::core::ffi::c_void, + Atype: cuda_types::cublas::cudaDataType, + lda: i64, + B: *const ::core::ffi::c_void, + Btype: cuda_types::cublas::cudaDataType, + ldb: i64, + beta: *const f32, + C: *mut ::core::ffi::c_void, + Ctype: cuda_types::cublas::cudaDataType, + ldc: 
i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasGemmEx( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + alpha: *const ::core::ffi::c_void, + A: *const ::core::ffi::c_void, + Atype: cuda_types::cublas::cudaDataType, + lda: ::core::ffi::c_int, + B: *const ::core::ffi::c_void, + Btype: cuda_types::cublas::cudaDataType, + ldb: ::core::ffi::c_int, + beta: *const ::core::ffi::c_void, + C: *mut ::core::ffi::c_void, + Ctype: cuda_types::cublas::cudaDataType, + ldc: ::core::ffi::c_int, + computeType: cuda_types::cublas::cublasComputeType_t, + algo: cuda_types::cublas::cublasGemmAlgo_t, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasGemmEx_64( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: i64, + n: i64, + k: i64, + alpha: *const ::core::ffi::c_void, + A: *const ::core::ffi::c_void, + Atype: cuda_types::cublas::cudaDataType, + lda: i64, + B: *const ::core::ffi::c_void, + Btype: cuda_types::cublas::cudaDataType, + ldb: i64, + beta: *const ::core::ffi::c_void, + C: *mut ::core::ffi::c_void, + Ctype: cuda_types::cublas::cudaDataType, + ldc: i64, + computeType: cuda_types::cublas::cublasComputeType_t, + algo: cuda_types::cublas::cublasGemmAlgo_t, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCgemmEx( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuComplex, + A: *const ::core::ffi::c_void, + Atype: cuda_types::cublas::cudaDataType, + lda: ::core::ffi::c_int, + B: *const ::core::ffi::c_void, + Btype: cuda_types::cublas::cudaDataType, + ldb: ::core::ffi::c_int, + beta: *const cuda_types::cublas::cuComplex, + C: *mut ::core::ffi::c_void, + Ctype: cuda_types::cublas::cudaDataType, + ldc: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCgemmEx_64( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: i64, + n: i64, + k: i64, + alpha: *const cuda_types::cublas::cuComplex, + A: *const ::core::ffi::c_void, + Atype: cuda_types::cublas::cudaDataType, + lda: i64, + B: *const ::core::ffi::c_void, + Btype: cuda_types::cublas::cudaDataType, + ldb: i64, + beta: *const cuda_types::cublas::cuComplex, + C: *mut ::core::ffi::c_void, + Ctype: cuda_types::cublas::cudaDataType, + ldc: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSsyrk_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + alpha: *const f32, + A: *const f32, + lda: ::core::ffi::c_int, + beta: *const f32, + C: *mut f32, + ldc: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSsyrk_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + n: i64, + k: i64, + alpha: *const f32, + A: *const f32, + lda: i64, + beta: *const f32, + C: *mut f32, + ldc: i64, + ) -> cuda_types::cublas::cublasStatus_t; + 
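+ // --- Illustrative sketch (editorial; not part of the generated bindings) ---
+ // The declarations in this table are 1:1 imports of the cuBLAS C API. As a
+ // hedged example of how one of them is driven through FFI, the sketch below
+ // calls cublasSgemm_v2 to compute C = alpha*A*B + beta*C for f32 matrices in
+ // column-major layout. `handle`, `d_a`, `d_b`, and `d_c` are hypothetical: a
+ // live cublasHandle_t and device pointers obtained elsewhere. Enum variants
+ // are written with their C-API names; the exact Rust spelling depends on how
+ // cuda_types renders them.
+ //
+ //     let (m, n, k) = (128i32, 128i32, 128i32);
+ //     let (alpha, beta) = (1.0f32, 0.0f32);
+ //     let status = unsafe {
+ //         cublasSgemm_v2(
+ //             handle,
+ //             cublasOperation_t::CUBLAS_OP_N, // A not transposed
+ //             cublasOperation_t::CUBLAS_OP_N, // B not transposed
+ //             m, n, k,
+ //             &alpha,
+ //             d_a, m, // lda >= max(1, m) when A is not transposed
+ //             d_b, k, // ldb >= max(1, k) when B is not transposed
+ //             &beta,
+ //             d_c, m, // ldc >= max(1, m)
+ //         )
+ //     };
+ //     // #[must_use] on the declaration forces callers to inspect `status`.
+ // ----------------------------------------------------------------------------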
#[must_use] + fn cublasDsyrk_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + alpha: *const f64, + A: *const f64, + lda: ::core::ffi::c_int, + beta: *const f64, + C: *mut f64, + ldc: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDsyrk_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + n: i64, + k: i64, + alpha: *const f64, + A: *const f64, + lda: i64, + beta: *const f64, + C: *mut f64, + ldc: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCsyrk_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuComplex, + A: *const cuda_types::cublas::cuComplex, + lda: ::core::ffi::c_int, + beta: *const cuda_types::cublas::cuComplex, + C: *mut cuda_types::cublas::cuComplex, + ldc: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCsyrk_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + n: i64, + k: i64, + alpha: *const cuda_types::cublas::cuComplex, + A: *const cuda_types::cublas::cuComplex, + lda: i64, + beta: *const cuda_types::cublas::cuComplex, + C: *mut cuda_types::cublas::cuComplex, + ldc: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZsyrk_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuDoubleComplex, + A: *const cuda_types::cublas::cuDoubleComplex, + lda: ::core::ffi::c_int, + beta: *const cuda_types::cublas::cuDoubleComplex, + C: *mut cuda_types::cublas::cuDoubleComplex, + ldc: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZsyrk_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + n: i64, + k: i64, + alpha: *const cuda_types::cublas::cuDoubleComplex, + A: *const cuda_types::cublas::cuDoubleComplex, + lda: i64, + beta: *const cuda_types::cublas::cuDoubleComplex, + C: *mut cuda_types::cublas::cuDoubleComplex, + ldc: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCsyrkEx( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuComplex, + A: *const ::core::ffi::c_void, + Atype: cuda_types::cublas::cudaDataType, + lda: ::core::ffi::c_int, + beta: *const cuda_types::cublas::cuComplex, + C: *mut ::core::ffi::c_void, + Ctype: cuda_types::cublas::cudaDataType, + ldc: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCsyrkEx_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + n: i64, + k: i64, + alpha: *const cuda_types::cublas::cuComplex, + A: *const ::core::ffi::c_void, + Atype: cuda_types::cublas::cudaDataType, + lda: 
i64, + beta: *const cuda_types::cublas::cuComplex, + C: *mut ::core::ffi::c_void, + Ctype: cuda_types::cublas::cudaDataType, + ldc: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCsyrk3mEx( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuComplex, + A: *const ::core::ffi::c_void, + Atype: cuda_types::cublas::cudaDataType, + lda: ::core::ffi::c_int, + beta: *const cuda_types::cublas::cuComplex, + C: *mut ::core::ffi::c_void, + Ctype: cuda_types::cublas::cudaDataType, + ldc: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCsyrk3mEx_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + n: i64, + k: i64, + alpha: *const cuda_types::cublas::cuComplex, + A: *const ::core::ffi::c_void, + Atype: cuda_types::cublas::cudaDataType, + lda: i64, + beta: *const cuda_types::cublas::cuComplex, + C: *mut ::core::ffi::c_void, + Ctype: cuda_types::cublas::cudaDataType, + ldc: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCherk_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + alpha: *const f32, + A: *const cuda_types::cublas::cuComplex, + lda: ::core::ffi::c_int, + beta: *const f32, + C: *mut cuda_types::cublas::cuComplex, + ldc: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCherk_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + n: i64, + k: i64, + alpha: *const f32, + A: *const cuda_types::cublas::cuComplex, + lda: i64, + beta: *const f32, + C: *mut cuda_types::cublas::cuComplex, + ldc: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZherk_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + alpha: *const f64, + A: *const cuda_types::cublas::cuDoubleComplex, + lda: ::core::ffi::c_int, + beta: *const f64, + C: *mut cuda_types::cublas::cuDoubleComplex, + ldc: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZherk_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + n: i64, + k: i64, + alpha: *const f64, + A: *const cuda_types::cublas::cuDoubleComplex, + lda: i64, + beta: *const f64, + C: *mut cuda_types::cublas::cuDoubleComplex, + ldc: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCherkEx( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + alpha: *const f32, + A: *const ::core::ffi::c_void, + Atype: cuda_types::cublas::cudaDataType, + lda: ::core::ffi::c_int, + beta: *const f32, + C: *mut ::core::ffi::c_void, + Ctype: cuda_types::cublas::cudaDataType, + ldc: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCherkEx_64( + handle: cuda_types::cublas::cublasHandle_t, + 
uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + n: i64, + k: i64, + alpha: *const f32, + A: *const ::core::ffi::c_void, + Atype: cuda_types::cublas::cudaDataType, + lda: i64, + beta: *const f32, + C: *mut ::core::ffi::c_void, + Ctype: cuda_types::cublas::cudaDataType, + ldc: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCherk3mEx( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + alpha: *const f32, + A: *const ::core::ffi::c_void, + Atype: cuda_types::cublas::cudaDataType, + lda: ::core::ffi::c_int, + beta: *const f32, + C: *mut ::core::ffi::c_void, + Ctype: cuda_types::cublas::cudaDataType, + ldc: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCherk3mEx_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + n: i64, + k: i64, + alpha: *const f32, + A: *const ::core::ffi::c_void, + Atype: cuda_types::cublas::cudaDataType, + lda: i64, + beta: *const f32, + C: *mut ::core::ffi::c_void, + Ctype: cuda_types::cublas::cudaDataType, + ldc: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSsyr2k_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + alpha: *const f32, + A: *const f32, + lda: ::core::ffi::c_int, + B: *const f32, + ldb: ::core::ffi::c_int, + beta: *const f32, + C: *mut f32, + ldc: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSsyr2k_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + n: i64, + k: i64, + alpha: *const f32, + A: *const f32, + lda: i64, + B: *const f32, + ldb: i64, + beta: *const f32, + C: *mut f32, + ldc: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDsyr2k_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + alpha: *const f64, + A: *const f64, + lda: ::core::ffi::c_int, + B: *const f64, + ldb: ::core::ffi::c_int, + beta: *const f64, + C: *mut f64, + ldc: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDsyr2k_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + n: i64, + k: i64, + alpha: *const f64, + A: *const f64, + lda: i64, + B: *const f64, + ldb: i64, + beta: *const f64, + C: *mut f64, + ldc: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCsyr2k_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuComplex, + A: *const cuda_types::cublas::cuComplex, + lda: ::core::ffi::c_int, + B: *const cuda_types::cublas::cuComplex, + ldb: ::core::ffi::c_int, + beta: *const cuda_types::cublas::cuComplex, + C: *mut cuda_types::cublas::cuComplex, + ldc: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn 
cublasCsyr2k_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + n: i64, + k: i64, + alpha: *const cuda_types::cublas::cuComplex, + A: *const cuda_types::cublas::cuComplex, + lda: i64, + B: *const cuda_types::cublas::cuComplex, + ldb: i64, + beta: *const cuda_types::cublas::cuComplex, + C: *mut cuda_types::cublas::cuComplex, + ldc: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZsyr2k_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuDoubleComplex, + A: *const cuda_types::cublas::cuDoubleComplex, + lda: ::core::ffi::c_int, + B: *const cuda_types::cublas::cuDoubleComplex, + ldb: ::core::ffi::c_int, + beta: *const cuda_types::cublas::cuDoubleComplex, + C: *mut cuda_types::cublas::cuDoubleComplex, + ldc: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZsyr2k_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + n: i64, + k: i64, + alpha: *const cuda_types::cublas::cuDoubleComplex, + A: *const cuda_types::cublas::cuDoubleComplex, + lda: i64, + B: *const cuda_types::cublas::cuDoubleComplex, + ldb: i64, + beta: *const cuda_types::cublas::cuDoubleComplex, + C: *mut cuda_types::cublas::cuDoubleComplex, + ldc: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCher2k_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuComplex, + A: *const cuda_types::cublas::cuComplex, + lda: ::core::ffi::c_int, + B: *const cuda_types::cublas::cuComplex, + ldb: ::core::ffi::c_int, + beta: *const f32, + C: *mut cuda_types::cublas::cuComplex, + ldc: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCher2k_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + n: i64, + k: i64, + alpha: *const cuda_types::cublas::cuComplex, + A: *const cuda_types::cublas::cuComplex, + lda: i64, + B: *const cuda_types::cublas::cuComplex, + ldb: i64, + beta: *const f32, + C: *mut cuda_types::cublas::cuComplex, + ldc: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZher2k_v2( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuDoubleComplex, + A: *const cuda_types::cublas::cuDoubleComplex, + lda: ::core::ffi::c_int, + B: *const cuda_types::cublas::cuDoubleComplex, + ldb: ::core::ffi::c_int, + beta: *const f64, + C: *mut cuda_types::cublas::cuDoubleComplex, + ldc: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZher2k_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + n: i64, + k: i64, + alpha: *const cuda_types::cublas::cuDoubleComplex, + A: *const cuda_types::cublas::cuDoubleComplex, + lda: i64, + B: *const cuda_types::cublas::cuDoubleComplex, + 
ldb: i64, + beta: *const f64, + C: *mut cuda_types::cublas::cuDoubleComplex, + ldc: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSsyrkx( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + alpha: *const f32, + A: *const f32, + lda: ::core::ffi::c_int, + B: *const f32, + ldb: ::core::ffi::c_int, + beta: *const f32, + C: *mut f32, + ldc: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSsyrkx_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + n: i64, + k: i64, + alpha: *const f32, + A: *const f32, + lda: i64, + B: *const f32, + ldb: i64, + beta: *const f32, + C: *mut f32, + ldc: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDsyrkx( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + alpha: *const f64, + A: *const f64, + lda: ::core::ffi::c_int, + B: *const f64, + ldb: ::core::ffi::c_int, + beta: *const f64, + C: *mut f64, + ldc: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDsyrkx_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + n: i64, + k: i64, + alpha: *const f64, + A: *const f64, + lda: i64, + B: *const f64, + ldb: i64, + beta: *const f64, + C: *mut f64, + ldc: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCsyrkx( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuComplex, + A: *const cuda_types::cublas::cuComplex, + lda: ::core::ffi::c_int, + B: *const cuda_types::cublas::cuComplex, + ldb: ::core::ffi::c_int, + beta: *const cuda_types::cublas::cuComplex, + C: *mut cuda_types::cublas::cuComplex, + ldc: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCsyrkx_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + n: i64, + k: i64, + alpha: *const cuda_types::cublas::cuComplex, + A: *const cuda_types::cublas::cuComplex, + lda: i64, + B: *const cuda_types::cublas::cuComplex, + ldb: i64, + beta: *const cuda_types::cublas::cuComplex, + C: *mut cuda_types::cublas::cuComplex, + ldc: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZsyrkx( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuDoubleComplex, + A: *const cuda_types::cublas::cuDoubleComplex, + lda: ::core::ffi::c_int, + B: *const cuda_types::cublas::cuDoubleComplex, + ldb: ::core::ffi::c_int, + beta: *const cuda_types::cublas::cuDoubleComplex, + C: *mut cuda_types::cublas::cuDoubleComplex, + ldc: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZsyrkx_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: 
cuda_types::cublas::cublasOperation_t, + n: i64, + k: i64, + alpha: *const cuda_types::cublas::cuDoubleComplex, + A: *const cuda_types::cublas::cuDoubleComplex, + lda: i64, + B: *const cuda_types::cublas::cuDoubleComplex, + ldb: i64, + beta: *const cuda_types::cublas::cuDoubleComplex, + C: *mut cuda_types::cublas::cuDoubleComplex, + ldc: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCherkx( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuComplex, + A: *const cuda_types::cublas::cuComplex, + lda: ::core::ffi::c_int, + B: *const cuda_types::cublas::cuComplex, + ldb: ::core::ffi::c_int, + beta: *const f32, + C: *mut cuda_types::cublas::cuComplex, + ldc: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCherkx_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + n: i64, + k: i64, + alpha: *const cuda_types::cublas::cuComplex, + A: *const cuda_types::cublas::cuComplex, + lda: i64, + B: *const cuda_types::cublas::cuComplex, + ldb: i64, + beta: *const f32, + C: *mut cuda_types::cublas::cuComplex, + ldc: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZherkx( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuDoubleComplex, + A: *const cuda_types::cublas::cuDoubleComplex, + lda: ::core::ffi::c_int, + B: *const cuda_types::cublas::cuDoubleComplex, + ldb: ::core::ffi::c_int, + beta: *const f64, + C: *mut cuda_types::cublas::cuDoubleComplex, + ldc: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZherkx_64( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + n: i64, + k: i64, + alpha: *const cuda_types::cublas::cuDoubleComplex, + A: *const cuda_types::cublas::cuDoubleComplex, + lda: i64, + B: *const cuda_types::cublas::cuDoubleComplex, + ldb: i64, + beta: *const f64, + C: *mut cuda_types::cublas::cuDoubleComplex, + ldc: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSsymm_v2( + handle: cuda_types::cublas::cublasHandle_t, + side: cuda_types::cublas::cublasSideMode_t, + uplo: cuda_types::cublas::cublasFillMode_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const f32, + A: *const f32, + lda: ::core::ffi::c_int, + B: *const f32, + ldb: ::core::ffi::c_int, + beta: *const f32, + C: *mut f32, + ldc: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSsymm_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + side: cuda_types::cublas::cublasSideMode_t, + uplo: cuda_types::cublas::cublasFillMode_t, + m: i64, + n: i64, + alpha: *const f32, + A: *const f32, + lda: i64, + B: *const f32, + ldb: i64, + beta: *const f32, + C: *mut f32, + ldc: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDsymm_v2( + handle: cuda_types::cublas::cublasHandle_t, + side: cuda_types::cublas::cublasSideMode_t, + uplo: cuda_types::cublas::cublasFillMode_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const f64, + A: *const f64, + lda: ::core::ffi::c_int, + B: 
*const f64, + ldb: ::core::ffi::c_int, + beta: *const f64, + C: *mut f64, + ldc: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDsymm_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + side: cuda_types::cublas::cublasSideMode_t, + uplo: cuda_types::cublas::cublasFillMode_t, + m: i64, + n: i64, + alpha: *const f64, + A: *const f64, + lda: i64, + B: *const f64, + ldb: i64, + beta: *const f64, + C: *mut f64, + ldc: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCsymm_v2( + handle: cuda_types::cublas::cublasHandle_t, + side: cuda_types::cublas::cublasSideMode_t, + uplo: cuda_types::cublas::cublasFillMode_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuComplex, + A: *const cuda_types::cublas::cuComplex, + lda: ::core::ffi::c_int, + B: *const cuda_types::cublas::cuComplex, + ldb: ::core::ffi::c_int, + beta: *const cuda_types::cublas::cuComplex, + C: *mut cuda_types::cublas::cuComplex, + ldc: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCsymm_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + side: cuda_types::cublas::cublasSideMode_t, + uplo: cuda_types::cublas::cublasFillMode_t, + m: i64, + n: i64, + alpha: *const cuda_types::cublas::cuComplex, + A: *const cuda_types::cublas::cuComplex, + lda: i64, + B: *const cuda_types::cublas::cuComplex, + ldb: i64, + beta: *const cuda_types::cublas::cuComplex, + C: *mut cuda_types::cublas::cuComplex, + ldc: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZsymm_v2( + handle: cuda_types::cublas::cublasHandle_t, + side: cuda_types::cublas::cublasSideMode_t, + uplo: cuda_types::cublas::cublasFillMode_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuDoubleComplex, + A: *const cuda_types::cublas::cuDoubleComplex, + lda: ::core::ffi::c_int, + B: *const cuda_types::cublas::cuDoubleComplex, + ldb: ::core::ffi::c_int, + beta: *const cuda_types::cublas::cuDoubleComplex, + C: *mut cuda_types::cublas::cuDoubleComplex, + ldc: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZsymm_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + side: cuda_types::cublas::cublasSideMode_t, + uplo: cuda_types::cublas::cublasFillMode_t, + m: i64, + n: i64, + alpha: *const cuda_types::cublas::cuDoubleComplex, + A: *const cuda_types::cublas::cuDoubleComplex, + lda: i64, + B: *const cuda_types::cublas::cuDoubleComplex, + ldb: i64, + beta: *const cuda_types::cublas::cuDoubleComplex, + C: *mut cuda_types::cublas::cuDoubleComplex, + ldc: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasChemm_v2( + handle: cuda_types::cublas::cublasHandle_t, + side: cuda_types::cublas::cublasSideMode_t, + uplo: cuda_types::cublas::cublasFillMode_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuComplex, + A: *const cuda_types::cublas::cuComplex, + lda: ::core::ffi::c_int, + B: *const cuda_types::cublas::cuComplex, + ldb: ::core::ffi::c_int, + beta: *const cuda_types::cublas::cuComplex, + C: *mut cuda_types::cublas::cuComplex, + ldc: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasChemm_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + side: cuda_types::cublas::cublasSideMode_t, + uplo: cuda_types::cublas::cublasFillMode_t, + m: i64, + n: i64, + alpha: *const cuda_types::cublas::cuComplex, + A: *const cuda_types::cublas::cuComplex, + lda: 
i64, + B: *const cuda_types::cublas::cuComplex, + ldb: i64, + beta: *const cuda_types::cublas::cuComplex, + C: *mut cuda_types::cublas::cuComplex, + ldc: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZhemm_v2( + handle: cuda_types::cublas::cublasHandle_t, + side: cuda_types::cublas::cublasSideMode_t, + uplo: cuda_types::cublas::cublasFillMode_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuDoubleComplex, + A: *const cuda_types::cublas::cuDoubleComplex, + lda: ::core::ffi::c_int, + B: *const cuda_types::cublas::cuDoubleComplex, + ldb: ::core::ffi::c_int, + beta: *const cuda_types::cublas::cuDoubleComplex, + C: *mut cuda_types::cublas::cuDoubleComplex, + ldc: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZhemm_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + side: cuda_types::cublas::cublasSideMode_t, + uplo: cuda_types::cublas::cublasFillMode_t, + m: i64, + n: i64, + alpha: *const cuda_types::cublas::cuDoubleComplex, + A: *const cuda_types::cublas::cuDoubleComplex, + lda: i64, + B: *const cuda_types::cublas::cuDoubleComplex, + ldb: i64, + beta: *const cuda_types::cublas::cuDoubleComplex, + C: *mut cuda_types::cublas::cuDoubleComplex, + ldc: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasStrsm_v2( + handle: cuda_types::cublas::cublasHandle_t, + side: cuda_types::cublas::cublasSideMode_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const f32, + A: *const f32, + lda: ::core::ffi::c_int, + B: *mut f32, + ldb: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasStrsm_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + side: cuda_types::cublas::cublasSideMode_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + m: i64, + n: i64, + alpha: *const f32, + A: *const f32, + lda: i64, + B: *mut f32, + ldb: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDtrsm_v2( + handle: cuda_types::cublas::cublasHandle_t, + side: cuda_types::cublas::cublasSideMode_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const f64, + A: *const f64, + lda: ::core::ffi::c_int, + B: *mut f64, + ldb: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDtrsm_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + side: cuda_types::cublas::cublasSideMode_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + m: i64, + n: i64, + alpha: *const f64, + A: *const f64, + lda: i64, + B: *mut f64, + ldb: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCtrsm_v2( + handle: cuda_types::cublas::cublasHandle_t, + side: cuda_types::cublas::cublasSideMode_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuComplex, + A: *const cuda_types::cublas::cuComplex, + lda: ::core::ffi::c_int, + B: *mut cuda_types::cublas::cuComplex, + 
ldb: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCtrsm_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + side: cuda_types::cublas::cublasSideMode_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + m: i64, + n: i64, + alpha: *const cuda_types::cublas::cuComplex, + A: *const cuda_types::cublas::cuComplex, + lda: i64, + B: *mut cuda_types::cublas::cuComplex, + ldb: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZtrsm_v2( + handle: cuda_types::cublas::cublasHandle_t, + side: cuda_types::cublas::cublasSideMode_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuDoubleComplex, + A: *const cuda_types::cublas::cuDoubleComplex, + lda: ::core::ffi::c_int, + B: *mut cuda_types::cublas::cuDoubleComplex, + ldb: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZtrsm_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + side: cuda_types::cublas::cublasSideMode_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + m: i64, + n: i64, + alpha: *const cuda_types::cublas::cuDoubleComplex, + A: *const cuda_types::cublas::cuDoubleComplex, + lda: i64, + B: *mut cuda_types::cublas::cuDoubleComplex, + ldb: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasStrmm_v2( + handle: cuda_types::cublas::cublasHandle_t, + side: cuda_types::cublas::cublasSideMode_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const f32, + A: *const f32, + lda: ::core::ffi::c_int, + B: *const f32, + ldb: ::core::ffi::c_int, + C: *mut f32, + ldc: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasStrmm_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + side: cuda_types::cublas::cublasSideMode_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + m: i64, + n: i64, + alpha: *const f32, + A: *const f32, + lda: i64, + B: *const f32, + ldb: i64, + C: *mut f32, + ldc: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDtrmm_v2( + handle: cuda_types::cublas::cublasHandle_t, + side: cuda_types::cublas::cublasSideMode_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const f64, + A: *const f64, + lda: ::core::ffi::c_int, + B: *const f64, + ldb: ::core::ffi::c_int, + C: *mut f64, + ldc: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDtrmm_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + side: cuda_types::cublas::cublasSideMode_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + m: i64, + n: i64, + alpha: *const f64, + A: *const f64, + lda: i64, + B: *const f64, + ldb: i64, + C: *mut f64, + ldc: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn 
cublasCtrmm_v2( + handle: cuda_types::cublas::cublasHandle_t, + side: cuda_types::cublas::cublasSideMode_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuComplex, + A: *const cuda_types::cublas::cuComplex, + lda: ::core::ffi::c_int, + B: *const cuda_types::cublas::cuComplex, + ldb: ::core::ffi::c_int, + C: *mut cuda_types::cublas::cuComplex, + ldc: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCtrmm_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + side: cuda_types::cublas::cublasSideMode_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + m: i64, + n: i64, + alpha: *const cuda_types::cublas::cuComplex, + A: *const cuda_types::cublas::cuComplex, + lda: i64, + B: *const cuda_types::cublas::cuComplex, + ldb: i64, + C: *mut cuda_types::cublas::cuComplex, + ldc: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZtrmm_v2( + handle: cuda_types::cublas::cublasHandle_t, + side: cuda_types::cublas::cublasSideMode_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuDoubleComplex, + A: *const cuda_types::cublas::cuDoubleComplex, + lda: ::core::ffi::c_int, + B: *const cuda_types::cublas::cuDoubleComplex, + ldb: ::core::ffi::c_int, + C: *mut cuda_types::cublas::cuDoubleComplex, + ldc: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZtrmm_v2_64( + handle: cuda_types::cublas::cublasHandle_t, + side: cuda_types::cublas::cublasSideMode_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + m: i64, + n: i64, + alpha: *const cuda_types::cublas::cuDoubleComplex, + A: *const cuda_types::cublas::cuDoubleComplex, + lda: i64, + B: *const cuda_types::cublas::cuDoubleComplex, + ldb: i64, + C: *mut cuda_types::cublas::cuDoubleComplex, + ldc: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasHgemmBatched( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::__half, + Aarray: *const *const cuda_types::cublas::__half, + lda: ::core::ffi::c_int, + Barray: *const *const cuda_types::cublas::__half, + ldb: ::core::ffi::c_int, + beta: *const cuda_types::cublas::__half, + Carray: *const *mut cuda_types::cublas::__half, + ldc: ::core::ffi::c_int, + batchCount: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasHgemmBatched_64( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: i64, + n: i64, + k: i64, + alpha: *const cuda_types::cublas::__half, + Aarray: *const *const cuda_types::cublas::__half, + lda: i64, + Barray: *const *const cuda_types::cublas::__half, + ldb: i64, + beta: *const cuda_types::cublas::__half, + Carray: *const *mut cuda_types::cublas::__half, + ldc: i64, + batchCount: i64, + ) -> 
cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSgemmBatched( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + alpha: *const f32, + Aarray: *const *const f32, + lda: ::core::ffi::c_int, + Barray: *const *const f32, + ldb: ::core::ffi::c_int, + beta: *const f32, + Carray: *const *mut f32, + ldc: ::core::ffi::c_int, + batchCount: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSgemmBatched_64( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: i64, + n: i64, + k: i64, + alpha: *const f32, + Aarray: *const *const f32, + lda: i64, + Barray: *const *const f32, + ldb: i64, + beta: *const f32, + Carray: *const *mut f32, + ldc: i64, + batchCount: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDgemmBatched( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + alpha: *const f64, + Aarray: *const *const f64, + lda: ::core::ffi::c_int, + Barray: *const *const f64, + ldb: ::core::ffi::c_int, + beta: *const f64, + Carray: *const *mut f64, + ldc: ::core::ffi::c_int, + batchCount: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDgemmBatched_64( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: i64, + n: i64, + k: i64, + alpha: *const f64, + Aarray: *const *const f64, + lda: i64, + Barray: *const *const f64, + ldb: i64, + beta: *const f64, + Carray: *const *mut f64, + ldc: i64, + batchCount: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCgemmBatched( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuComplex, + Aarray: *const *const cuda_types::cublas::cuComplex, + lda: ::core::ffi::c_int, + Barray: *const *const cuda_types::cublas::cuComplex, + ldb: ::core::ffi::c_int, + beta: *const cuda_types::cublas::cuComplex, + Carray: *const *mut cuda_types::cublas::cuComplex, + ldc: ::core::ffi::c_int, + batchCount: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCgemmBatched_64( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: i64, + n: i64, + k: i64, + alpha: *const cuda_types::cublas::cuComplex, + Aarray: *const *const cuda_types::cublas::cuComplex, + lda: i64, + Barray: *const *const cuda_types::cublas::cuComplex, + ldb: i64, + beta: *const cuda_types::cublas::cuComplex, + Carray: *const *mut cuda_types::cublas::cuComplex, + ldc: i64, + batchCount: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCgemm3mBatched( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuComplex, + 
Aarray: *const *const cuda_types::cublas::cuComplex, + lda: ::core::ffi::c_int, + Barray: *const *const cuda_types::cublas::cuComplex, + ldb: ::core::ffi::c_int, + beta: *const cuda_types::cublas::cuComplex, + Carray: *const *mut cuda_types::cublas::cuComplex, + ldc: ::core::ffi::c_int, + batchCount: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCgemm3mBatched_64( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: i64, + n: i64, + k: i64, + alpha: *const cuda_types::cublas::cuComplex, + Aarray: *const *const cuda_types::cublas::cuComplex, + lda: i64, + Barray: *const *const cuda_types::cublas::cuComplex, + ldb: i64, + beta: *const cuda_types::cublas::cuComplex, + Carray: *const *mut cuda_types::cublas::cuComplex, + ldc: i64, + batchCount: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZgemmBatched( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuDoubleComplex, + Aarray: *const *const cuda_types::cublas::cuDoubleComplex, + lda: ::core::ffi::c_int, + Barray: *const *const cuda_types::cublas::cuDoubleComplex, + ldb: ::core::ffi::c_int, + beta: *const cuda_types::cublas::cuDoubleComplex, + Carray: *const *mut cuda_types::cublas::cuDoubleComplex, + ldc: ::core::ffi::c_int, + batchCount: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZgemmBatched_64( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: i64, + n: i64, + k: i64, + alpha: *const cuda_types::cublas::cuDoubleComplex, + Aarray: *const *const cuda_types::cublas::cuDoubleComplex, + lda: i64, + Barray: *const *const cuda_types::cublas::cuDoubleComplex, + ldb: i64, + beta: *const cuda_types::cublas::cuDoubleComplex, + Carray: *const *mut cuda_types::cublas::cuDoubleComplex, + ldc: i64, + batchCount: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasHgemmStridedBatched( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::__half, + A: *const cuda_types::cublas::__half, + lda: ::core::ffi::c_int, + strideA: ::core::ffi::c_longlong, + B: *const cuda_types::cublas::__half, + ldb: ::core::ffi::c_int, + strideB: ::core::ffi::c_longlong, + beta: *const cuda_types::cublas::__half, + C: *mut cuda_types::cublas::__half, + ldc: ::core::ffi::c_int, + strideC: ::core::ffi::c_longlong, + batchCount: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasHgemmStridedBatched_64( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: i64, + n: i64, + k: i64, + alpha: *const cuda_types::cublas::__half, + A: *const cuda_types::cublas::__half, + lda: i64, + strideA: ::core::ffi::c_longlong, + B: *const cuda_types::cublas::__half, + ldb: i64, + strideB: ::core::ffi::c_longlong, + beta: *const cuda_types::cublas::__half, + C: *mut cuda_types::cublas::__half, + ldc: i64, + strideC: ::core::ffi::c_longlong, + 
batchCount: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSgemmStridedBatched( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + alpha: *const f32, + A: *const f32, + lda: ::core::ffi::c_int, + strideA: ::core::ffi::c_longlong, + B: *const f32, + ldb: ::core::ffi::c_int, + strideB: ::core::ffi::c_longlong, + beta: *const f32, + C: *mut f32, + ldc: ::core::ffi::c_int, + strideC: ::core::ffi::c_longlong, + batchCount: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSgemmStridedBatched_64( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: i64, + n: i64, + k: i64, + alpha: *const f32, + A: *const f32, + lda: i64, + strideA: ::core::ffi::c_longlong, + B: *const f32, + ldb: i64, + strideB: ::core::ffi::c_longlong, + beta: *const f32, + C: *mut f32, + ldc: i64, + strideC: ::core::ffi::c_longlong, + batchCount: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDgemmStridedBatched( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + alpha: *const f64, + A: *const f64, + lda: ::core::ffi::c_int, + strideA: ::core::ffi::c_longlong, + B: *const f64, + ldb: ::core::ffi::c_int, + strideB: ::core::ffi::c_longlong, + beta: *const f64, + C: *mut f64, + ldc: ::core::ffi::c_int, + strideC: ::core::ffi::c_longlong, + batchCount: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDgemmStridedBatched_64( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: i64, + n: i64, + k: i64, + alpha: *const f64, + A: *const f64, + lda: i64, + strideA: ::core::ffi::c_longlong, + B: *const f64, + ldb: i64, + strideB: ::core::ffi::c_longlong, + beta: *const f64, + C: *mut f64, + ldc: i64, + strideC: ::core::ffi::c_longlong, + batchCount: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCgemmStridedBatched( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuComplex, + A: *const cuda_types::cublas::cuComplex, + lda: ::core::ffi::c_int, + strideA: ::core::ffi::c_longlong, + B: *const cuda_types::cublas::cuComplex, + ldb: ::core::ffi::c_int, + strideB: ::core::ffi::c_longlong, + beta: *const cuda_types::cublas::cuComplex, + C: *mut cuda_types::cublas::cuComplex, + ldc: ::core::ffi::c_int, + strideC: ::core::ffi::c_longlong, + batchCount: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCgemmStridedBatched_64( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: i64, + n: i64, + k: i64, + alpha: *const cuda_types::cublas::cuComplex, + A: *const cuda_types::cublas::cuComplex, + lda: i64, + strideA: ::core::ffi::c_longlong, + B: *const cuda_types::cublas::cuComplex, + ldb: i64, + strideB: ::core::ffi::c_longlong, + beta: 
*const cuda_types::cublas::cuComplex, + C: *mut cuda_types::cublas::cuComplex, + ldc: i64, + strideC: ::core::ffi::c_longlong, + batchCount: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCgemm3mStridedBatched( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuComplex, + A: *const cuda_types::cublas::cuComplex, + lda: ::core::ffi::c_int, + strideA: ::core::ffi::c_longlong, + B: *const cuda_types::cublas::cuComplex, + ldb: ::core::ffi::c_int, + strideB: ::core::ffi::c_longlong, + beta: *const cuda_types::cublas::cuComplex, + C: *mut cuda_types::cublas::cuComplex, + ldc: ::core::ffi::c_int, + strideC: ::core::ffi::c_longlong, + batchCount: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCgemm3mStridedBatched_64( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: i64, + n: i64, + k: i64, + alpha: *const cuda_types::cublas::cuComplex, + A: *const cuda_types::cublas::cuComplex, + lda: i64, + strideA: ::core::ffi::c_longlong, + B: *const cuda_types::cublas::cuComplex, + ldb: i64, + strideB: ::core::ffi::c_longlong, + beta: *const cuda_types::cublas::cuComplex, + C: *mut cuda_types::cublas::cuComplex, + ldc: i64, + strideC: ::core::ffi::c_longlong, + batchCount: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZgemmStridedBatched( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuDoubleComplex, + A: *const cuda_types::cublas::cuDoubleComplex, + lda: ::core::ffi::c_int, + strideA: ::core::ffi::c_longlong, + B: *const cuda_types::cublas::cuDoubleComplex, + ldb: ::core::ffi::c_int, + strideB: ::core::ffi::c_longlong, + beta: *const cuda_types::cublas::cuDoubleComplex, + C: *mut cuda_types::cublas::cuDoubleComplex, + ldc: ::core::ffi::c_int, + strideC: ::core::ffi::c_longlong, + batchCount: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZgemmStridedBatched_64( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: i64, + n: i64, + k: i64, + alpha: *const cuda_types::cublas::cuDoubleComplex, + A: *const cuda_types::cublas::cuDoubleComplex, + lda: i64, + strideA: ::core::ffi::c_longlong, + B: *const cuda_types::cublas::cuDoubleComplex, + ldb: i64, + strideB: ::core::ffi::c_longlong, + beta: *const cuda_types::cublas::cuDoubleComplex, + C: *mut cuda_types::cublas::cuDoubleComplex, + ldc: i64, + strideC: ::core::ffi::c_longlong, + batchCount: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasGemmBatchedEx( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + alpha: *const ::core::ffi::c_void, + Aarray: *const *const ::core::ffi::c_void, + Atype: cuda_types::cublas::cudaDataType, + lda: ::core::ffi::c_int, + Barray: *const *const ::core::ffi::c_void, + Btype: cuda_types::cublas::cudaDataType, + 
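+        // Atype/Btype/Ctype describe how each matrix is stored, while
+        // computeType (further down in this signature) selects the precision
+        // used internally for the accumulation.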
ldb: ::core::ffi::c_int, + beta: *const ::core::ffi::c_void, + Carray: *const *mut ::core::ffi::c_void, + Ctype: cuda_types::cublas::cudaDataType, + ldc: ::core::ffi::c_int, + batchCount: ::core::ffi::c_int, + computeType: cuda_types::cublas::cublasComputeType_t, + algo: cuda_types::cublas::cublasGemmAlgo_t, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasGemmBatchedEx_64( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: i64, + n: i64, + k: i64, + alpha: *const ::core::ffi::c_void, + Aarray: *const *const ::core::ffi::c_void, + Atype: cuda_types::cublas::cudaDataType, + lda: i64, + Barray: *const *const ::core::ffi::c_void, + Btype: cuda_types::cublas::cudaDataType, + ldb: i64, + beta: *const ::core::ffi::c_void, + Carray: *const *mut ::core::ffi::c_void, + Ctype: cuda_types::cublas::cudaDataType, + ldc: i64, + batchCount: i64, + computeType: cuda_types::cublas::cublasComputeType_t, + algo: cuda_types::cublas::cublasGemmAlgo_t, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasGemmStridedBatchedEx( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + alpha: *const ::core::ffi::c_void, + A: *const ::core::ffi::c_void, + Atype: cuda_types::cublas::cudaDataType, + lda: ::core::ffi::c_int, + strideA: ::core::ffi::c_longlong, + B: *const ::core::ffi::c_void, + Btype: cuda_types::cublas::cudaDataType, + ldb: ::core::ffi::c_int, + strideB: ::core::ffi::c_longlong, + beta: *const ::core::ffi::c_void, + C: *mut ::core::ffi::c_void, + Ctype: cuda_types::cublas::cudaDataType, + ldc: ::core::ffi::c_int, + strideC: ::core::ffi::c_longlong, + batchCount: ::core::ffi::c_int, + computeType: cuda_types::cublas::cublasComputeType_t, + algo: cuda_types::cublas::cublasGemmAlgo_t, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasGemmStridedBatchedEx_64( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: i64, + n: i64, + k: i64, + alpha: *const ::core::ffi::c_void, + A: *const ::core::ffi::c_void, + Atype: cuda_types::cublas::cudaDataType, + lda: i64, + strideA: ::core::ffi::c_longlong, + B: *const ::core::ffi::c_void, + Btype: cuda_types::cublas::cudaDataType, + ldb: i64, + strideB: ::core::ffi::c_longlong, + beta: *const ::core::ffi::c_void, + C: *mut ::core::ffi::c_void, + Ctype: cuda_types::cublas::cudaDataType, + ldc: i64, + strideC: ::core::ffi::c_longlong, + batchCount: i64, + computeType: cuda_types::cublas::cublasComputeType_t, + algo: cuda_types::cublas::cublasGemmAlgo_t, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSgemmGroupedBatched( + handle: cuda_types::cublas::cublasHandle_t, + transa_array: *const cuda_types::cublas::cublasOperation_t, + transb_array: *const cuda_types::cublas::cublasOperation_t, + m_array: *const ::core::ffi::c_int, + n_array: *const ::core::ffi::c_int, + k_array: *const ::core::ffi::c_int, + alpha_array: *const f32, + Aarray: *const *const f32, + lda_array: *const ::core::ffi::c_int, + Barray: *const *const f32, + ldb_array: *const ::core::ffi::c_int, + beta_array: *const f32, + Carray: *const *mut f32, + ldc_array: *const ::core::ffi::c_int, + group_count: ::core::ffi::c_int, + group_size: *const ::core::ffi::c_int, + ) -> 
cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSgemmGroupedBatched_64( + handle: cuda_types::cublas::cublasHandle_t, + transa_array: *const cuda_types::cublas::cublasOperation_t, + transb_array: *const cuda_types::cublas::cublasOperation_t, + m_array: *const i64, + n_array: *const i64, + k_array: *const i64, + alpha_array: *const f32, + Aarray: *const *const f32, + lda_array: *const i64, + Barray: *const *const f32, + ldb_array: *const i64, + beta_array: *const f32, + Carray: *const *mut f32, + ldc_array: *const i64, + group_count: i64, + group_size: *const i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDgemmGroupedBatched( + handle: cuda_types::cublas::cublasHandle_t, + transa_array: *const cuda_types::cublas::cublasOperation_t, + transb_array: *const cuda_types::cublas::cublasOperation_t, + m_array: *const ::core::ffi::c_int, + n_array: *const ::core::ffi::c_int, + k_array: *const ::core::ffi::c_int, + alpha_array: *const f64, + Aarray: *const *const f64, + lda_array: *const ::core::ffi::c_int, + Barray: *const *const f64, + ldb_array: *const ::core::ffi::c_int, + beta_array: *const f64, + Carray: *const *mut f64, + ldc_array: *const ::core::ffi::c_int, + group_count: ::core::ffi::c_int, + group_size: *const ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDgemmGroupedBatched_64( + handle: cuda_types::cublas::cublasHandle_t, + transa_array: *const cuda_types::cublas::cublasOperation_t, + transb_array: *const cuda_types::cublas::cublasOperation_t, + m_array: *const i64, + n_array: *const i64, + k_array: *const i64, + alpha_array: *const f64, + Aarray: *const *const f64, + lda_array: *const i64, + Barray: *const *const f64, + ldb_array: *const i64, + beta_array: *const f64, + Carray: *const *mut f64, + ldc_array: *const i64, + group_count: i64, + group_size: *const i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasGemmGroupedBatchedEx( + handle: cuda_types::cublas::cublasHandle_t, + transa_array: *const cuda_types::cublas::cublasOperation_t, + transb_array: *const cuda_types::cublas::cublasOperation_t, + m_array: *const ::core::ffi::c_int, + n_array: *const ::core::ffi::c_int, + k_array: *const ::core::ffi::c_int, + alpha_array: *const ::core::ffi::c_void, + Aarray: *const *const ::core::ffi::c_void, + Atype: cuda_types::cublas::cudaDataType_t, + lda_array: *const ::core::ffi::c_int, + Barray: *const *const ::core::ffi::c_void, + Btype: cuda_types::cublas::cudaDataType_t, + ldb_array: *const ::core::ffi::c_int, + beta_array: *const ::core::ffi::c_void, + Carray: *const *mut ::core::ffi::c_void, + Ctype: cuda_types::cublas::cudaDataType_t, + ldc_array: *const ::core::ffi::c_int, + group_count: ::core::ffi::c_int, + group_size: *const ::core::ffi::c_int, + computeType: cuda_types::cublas::cublasComputeType_t, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasGemmGroupedBatchedEx_64( + handle: cuda_types::cublas::cublasHandle_t, + transa_array: *const cuda_types::cublas::cublasOperation_t, + transb_array: *const cuda_types::cublas::cublasOperation_t, + m_array: *const i64, + n_array: *const i64, + k_array: *const i64, + alpha_array: *const ::core::ffi::c_void, + Aarray: *const *const ::core::ffi::c_void, + Atype: cuda_types::cublas::cudaDataType_t, + lda_array: *const i64, + Barray: *const *const ::core::ffi::c_void, + Btype: cuda_types::cublas::cudaDataType_t, + ldb_array: *const i64, + beta_array: *const ::core::ffi::c_void, + Carray: *const *mut ::core::ffi::c_void, + 
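+        // In the grouped-batched variants, group_count groups are executed and
+        // the group_size[i] problems in group i all share the shape and
+        // transpose parameters taken from entry i of the *_array arguments.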
Ctype: cuda_types::cublas::cudaDataType_t, + ldc_array: *const i64, + group_count: i64, + group_size: *const i64, + computeType: cuda_types::cublas::cublasComputeType_t, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSgeam( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const f32, + A: *const f32, + lda: ::core::ffi::c_int, + beta: *const f32, + B: *const f32, + ldb: ::core::ffi::c_int, + C: *mut f32, + ldc: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSgeam_64( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: i64, + n: i64, + alpha: *const f32, + A: *const f32, + lda: i64, + beta: *const f32, + B: *const f32, + ldb: i64, + C: *mut f32, + ldc: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDgeam( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const f64, + A: *const f64, + lda: ::core::ffi::c_int, + beta: *const f64, + B: *const f64, + ldb: ::core::ffi::c_int, + C: *mut f64, + ldc: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDgeam_64( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: i64, + n: i64, + alpha: *const f64, + A: *const f64, + lda: i64, + beta: *const f64, + B: *const f64, + ldb: i64, + C: *mut f64, + ldc: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCgeam( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuComplex, + A: *const cuda_types::cublas::cuComplex, + lda: ::core::ffi::c_int, + beta: *const cuda_types::cublas::cuComplex, + B: *const cuda_types::cublas::cuComplex, + ldb: ::core::ffi::c_int, + C: *mut cuda_types::cublas::cuComplex, + ldc: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCgeam_64( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: i64, + n: i64, + alpha: *const cuda_types::cublas::cuComplex, + A: *const cuda_types::cublas::cuComplex, + lda: i64, + beta: *const cuda_types::cublas::cuComplex, + B: *const cuda_types::cublas::cuComplex, + ldb: i64, + C: *mut cuda_types::cublas::cuComplex, + ldc: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZgeam( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuDoubleComplex, + A: *const cuda_types::cublas::cuDoubleComplex, + lda: ::core::ffi::c_int, + beta: *const cuda_types::cublas::cuDoubleComplex, + B: *const cuda_types::cublas::cuDoubleComplex, + ldb: ::core::ffi::c_int, + C: *mut cuda_types::cublas::cuDoubleComplex, + ldc: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZgeam_64( + handle: 
cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + m: i64, + n: i64, + alpha: *const cuda_types::cublas::cuDoubleComplex, + A: *const cuda_types::cublas::cuDoubleComplex, + lda: i64, + beta: *const cuda_types::cublas::cuDoubleComplex, + B: *const cuda_types::cublas::cuDoubleComplex, + ldb: i64, + C: *mut cuda_types::cublas::cuDoubleComplex, + ldc: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasStrsmBatched( + handle: cuda_types::cublas::cublasHandle_t, + side: cuda_types::cublas::cublasSideMode_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const f32, + A: *const *const f32, + lda: ::core::ffi::c_int, + B: *const *mut f32, + ldb: ::core::ffi::c_int, + batchCount: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasStrsmBatched_64( + handle: cuda_types::cublas::cublasHandle_t, + side: cuda_types::cublas::cublasSideMode_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + m: i64, + n: i64, + alpha: *const f32, + A: *const *const f32, + lda: i64, + B: *const *mut f32, + ldb: i64, + batchCount: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDtrsmBatched( + handle: cuda_types::cublas::cublasHandle_t, + side: cuda_types::cublas::cublasSideMode_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const f64, + A: *const *const f64, + lda: ::core::ffi::c_int, + B: *const *mut f64, + ldb: ::core::ffi::c_int, + batchCount: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDtrsmBatched_64( + handle: cuda_types::cublas::cublasHandle_t, + side: cuda_types::cublas::cublasSideMode_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + m: i64, + n: i64, + alpha: *const f64, + A: *const *const f64, + lda: i64, + B: *const *mut f64, + ldb: i64, + batchCount: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCtrsmBatched( + handle: cuda_types::cublas::cublasHandle_t, + side: cuda_types::cublas::cublasSideMode_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuComplex, + A: *const *const cuda_types::cublas::cuComplex, + lda: ::core::ffi::c_int, + B: *const *mut cuda_types::cublas::cuComplex, + ldb: ::core::ffi::c_int, + batchCount: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCtrsmBatched_64( + handle: cuda_types::cublas::cublasHandle_t, + side: cuda_types::cublas::cublasSideMode_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + m: i64, + n: i64, + alpha: *const cuda_types::cublas::cuComplex, + A: *const *const cuda_types::cublas::cuComplex, + lda: i64, + B: *const *mut cuda_types::cublas::cuComplex, + ldb: i64, + batchCount: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + 
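+    // A minimal usage sketch (not part of the generated bindings) for the
+    // batched triangular solves declared around here. `handle`, `a_ptrs`,
+    // `b_ptrs`, `m`, `n` and `batch` are placeholders: the pointer arrays live
+    // in device memory and reference `batch` m-by-m factors and m-by-n
+    // right-hand sides. Error handling is elided:
+    //
+    //     let alpha = 1.0f32;
+    //     let status = unsafe {
+    //         cublasStrsmBatched(
+    //             handle,
+    //             cublasSideMode_t::CUBLAS_SIDE_LEFT,
+    //             cublasFillMode_t::CUBLAS_FILL_MODE_LOWER,
+    //             cublasOperation_t::CUBLAS_OP_N,
+    //             cublasDiagType_t::CUBLAS_DIAG_NON_UNIT,
+    //             m, n,
+    //             &alpha,
+    //             a_ptrs, m,
+    //             b_ptrs, m,
+    //             batch,
+    //         )
+    //     };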
fn cublasZtrsmBatched( + handle: cuda_types::cublas::cublasHandle_t, + side: cuda_types::cublas::cublasSideMode_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const cuda_types::cublas::cuDoubleComplex, + A: *const *const cuda_types::cublas::cuDoubleComplex, + lda: ::core::ffi::c_int, + B: *const *mut cuda_types::cublas::cuDoubleComplex, + ldb: ::core::ffi::c_int, + batchCount: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZtrsmBatched_64( + handle: cuda_types::cublas::cublasHandle_t, + side: cuda_types::cublas::cublasSideMode_t, + uplo: cuda_types::cublas::cublasFillMode_t, + trans: cuda_types::cublas::cublasOperation_t, + diag: cuda_types::cublas::cublasDiagType_t, + m: i64, + n: i64, + alpha: *const cuda_types::cublas::cuDoubleComplex, + A: *const *const cuda_types::cublas::cuDoubleComplex, + lda: i64, + B: *const *mut cuda_types::cublas::cuDoubleComplex, + ldb: i64, + batchCount: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSdgmm( + handle: cuda_types::cublas::cublasHandle_t, + mode: cuda_types::cublas::cublasSideMode_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + A: *const f32, + lda: ::core::ffi::c_int, + x: *const f32, + incx: ::core::ffi::c_int, + C: *mut f32, + ldc: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSdgmm_64( + handle: cuda_types::cublas::cublasHandle_t, + mode: cuda_types::cublas::cublasSideMode_t, + m: i64, + n: i64, + A: *const f32, + lda: i64, + x: *const f32, + incx: i64, + C: *mut f32, + ldc: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDdgmm( + handle: cuda_types::cublas::cublasHandle_t, + mode: cuda_types::cublas::cublasSideMode_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + A: *const f64, + lda: ::core::ffi::c_int, + x: *const f64, + incx: ::core::ffi::c_int, + C: *mut f64, + ldc: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDdgmm_64( + handle: cuda_types::cublas::cublasHandle_t, + mode: cuda_types::cublas::cublasSideMode_t, + m: i64, + n: i64, + A: *const f64, + lda: i64, + x: *const f64, + incx: i64, + C: *mut f64, + ldc: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCdgmm( + handle: cuda_types::cublas::cublasHandle_t, + mode: cuda_types::cublas::cublasSideMode_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + A: *const cuda_types::cublas::cuComplex, + lda: ::core::ffi::c_int, + x: *const cuda_types::cublas::cuComplex, + incx: ::core::ffi::c_int, + C: *mut cuda_types::cublas::cuComplex, + ldc: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCdgmm_64( + handle: cuda_types::cublas::cublasHandle_t, + mode: cuda_types::cublas::cublasSideMode_t, + m: i64, + n: i64, + A: *const cuda_types::cublas::cuComplex, + lda: i64, + x: *const cuda_types::cublas::cuComplex, + incx: i64, + C: *mut cuda_types::cublas::cuComplex, + ldc: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZdgmm( + handle: cuda_types::cublas::cublasHandle_t, + mode: cuda_types::cublas::cublasSideMode_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + A: *const cuda_types::cublas::cuDoubleComplex, + lda: ::core::ffi::c_int, + x: *const cuda_types::cublas::cuDoubleComplex, + incx: ::core::ffi::c_int, + C: *mut cuda_types::cublas::cuDoubleComplex, + ldc: 
::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZdgmm_64( + handle: cuda_types::cublas::cublasHandle_t, + mode: cuda_types::cublas::cublasSideMode_t, + m: i64, + n: i64, + A: *const cuda_types::cublas::cuDoubleComplex, + lda: i64, + x: *const cuda_types::cublas::cuDoubleComplex, + incx: i64, + C: *mut cuda_types::cublas::cuDoubleComplex, + ldc: i64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSmatinvBatched( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + A: *const *const f32, + lda: ::core::ffi::c_int, + Ainv: *const *mut f32, + lda_inv: ::core::ffi::c_int, + info: *mut ::core::ffi::c_int, + batchSize: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDmatinvBatched( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + A: *const *const f64, + lda: ::core::ffi::c_int, + Ainv: *const *mut f64, + lda_inv: ::core::ffi::c_int, + info: *mut ::core::ffi::c_int, + batchSize: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCmatinvBatched( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + A: *const *const cuda_types::cublas::cuComplex, + lda: ::core::ffi::c_int, + Ainv: *const *mut cuda_types::cublas::cuComplex, + lda_inv: ::core::ffi::c_int, + info: *mut ::core::ffi::c_int, + batchSize: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZmatinvBatched( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + A: *const *const cuda_types::cublas::cuDoubleComplex, + lda: ::core::ffi::c_int, + Ainv: *const *mut cuda_types::cublas::cuDoubleComplex, + lda_inv: ::core::ffi::c_int, + info: *mut ::core::ffi::c_int, + batchSize: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSgeqrfBatched( + handle: cuda_types::cublas::cublasHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + Aarray: *const *mut f32, + lda: ::core::ffi::c_int, + TauArray: *const *mut f32, + info: *mut ::core::ffi::c_int, + batchSize: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDgeqrfBatched( + handle: cuda_types::cublas::cublasHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + Aarray: *const *mut f64, + lda: ::core::ffi::c_int, + TauArray: *const *mut f64, + info: *mut ::core::ffi::c_int, + batchSize: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCgeqrfBatched( + handle: cuda_types::cublas::cublasHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + Aarray: *const *mut cuda_types::cublas::cuComplex, + lda: ::core::ffi::c_int, + TauArray: *const *mut cuda_types::cublas::cuComplex, + info: *mut ::core::ffi::c_int, + batchSize: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZgeqrfBatched( + handle: cuda_types::cublas::cublasHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + Aarray: *const *mut cuda_types::cublas::cuDoubleComplex, + lda: ::core::ffi::c_int, + TauArray: *const *mut cuda_types::cublas::cuDoubleComplex, + info: *mut ::core::ffi::c_int, + batchSize: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSgelsBatched( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nrhs: ::core::ffi::c_int, + Aarray: *const *mut f32, + lda: 
::core::ffi::c_int, + Carray: *const *mut f32, + ldc: ::core::ffi::c_int, + info: *mut ::core::ffi::c_int, + devInfoArray: *mut ::core::ffi::c_int, + batchSize: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDgelsBatched( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nrhs: ::core::ffi::c_int, + Aarray: *const *mut f64, + lda: ::core::ffi::c_int, + Carray: *const *mut f64, + ldc: ::core::ffi::c_int, + info: *mut ::core::ffi::c_int, + devInfoArray: *mut ::core::ffi::c_int, + batchSize: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCgelsBatched( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nrhs: ::core::ffi::c_int, + Aarray: *const *mut cuda_types::cublas::cuComplex, + lda: ::core::ffi::c_int, + Carray: *const *mut cuda_types::cublas::cuComplex, + ldc: ::core::ffi::c_int, + info: *mut ::core::ffi::c_int, + devInfoArray: *mut ::core::ffi::c_int, + batchSize: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZgelsBatched( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nrhs: ::core::ffi::c_int, + Aarray: *const *mut cuda_types::cublas::cuDoubleComplex, + lda: ::core::ffi::c_int, + Carray: *const *mut cuda_types::cublas::cuDoubleComplex, + ldc: ::core::ffi::c_int, + info: *mut ::core::ffi::c_int, + devInfoArray: *mut ::core::ffi::c_int, + batchSize: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasStpttr( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: ::core::ffi::c_int, + AP: *const f32, + A: *mut f32, + lda: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDtpttr( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: ::core::ffi::c_int, + AP: *const f64, + A: *mut f64, + lda: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCtpttr( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: ::core::ffi::c_int, + AP: *const cuda_types::cublas::cuComplex, + A: *mut cuda_types::cublas::cuComplex, + lda: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZtpttr( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: ::core::ffi::c_int, + AP: *const cuda_types::cublas::cuDoubleComplex, + A: *mut cuda_types::cublas::cuDoubleComplex, + lda: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasStrttp( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: ::core::ffi::c_int, + A: *const f32, + lda: ::core::ffi::c_int, + AP: *mut f32, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDtrttp( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: ::core::ffi::c_int, + A: *const f64, + lda: ::core::ffi::c_int, + AP: *mut f64, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCtrttp( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: ::core::ffi::c_int, + A: *const 
cuda_types::cublas::cuComplex, + lda: ::core::ffi::c_int, + AP: *mut cuda_types::cublas::cuComplex, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZtrttp( + handle: cuda_types::cublas::cublasHandle_t, + uplo: cuda_types::cublas::cublasFillMode_t, + n: ::core::ffi::c_int, + A: *const cuda_types::cublas::cuDoubleComplex, + lda: ::core::ffi::c_int, + AP: *mut cuda_types::cublas::cuDoubleComplex, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSgetrfBatched( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + A: *const *mut f32, + lda: ::core::ffi::c_int, + P: *mut ::core::ffi::c_int, + info: *mut ::core::ffi::c_int, + batchSize: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDgetrfBatched( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + A: *const *mut f64, + lda: ::core::ffi::c_int, + P: *mut ::core::ffi::c_int, + info: *mut ::core::ffi::c_int, + batchSize: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCgetrfBatched( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + A: *const *mut cuda_types::cublas::cuComplex, + lda: ::core::ffi::c_int, + P: *mut ::core::ffi::c_int, + info: *mut ::core::ffi::c_int, + batchSize: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZgetrfBatched( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + A: *const *mut cuda_types::cublas::cuDoubleComplex, + lda: ::core::ffi::c_int, + P: *mut ::core::ffi::c_int, + info: *mut ::core::ffi::c_int, + batchSize: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSgetriBatched( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + A: *const *const f32, + lda: ::core::ffi::c_int, + P: *const ::core::ffi::c_int, + C: *const *mut f32, + ldc: ::core::ffi::c_int, + info: *mut ::core::ffi::c_int, + batchSize: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDgetriBatched( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + A: *const *const f64, + lda: ::core::ffi::c_int, + P: *const ::core::ffi::c_int, + C: *const *mut f64, + ldc: ::core::ffi::c_int, + info: *mut ::core::ffi::c_int, + batchSize: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCgetriBatched( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + A: *const *const cuda_types::cublas::cuComplex, + lda: ::core::ffi::c_int, + P: *const ::core::ffi::c_int, + C: *const *mut cuda_types::cublas::cuComplex, + ldc: ::core::ffi::c_int, + info: *mut ::core::ffi::c_int, + batchSize: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZgetriBatched( + handle: cuda_types::cublas::cublasHandle_t, + n: ::core::ffi::c_int, + A: *const *const cuda_types::cublas::cuDoubleComplex, + lda: ::core::ffi::c_int, + P: *const ::core::ffi::c_int, + C: *const *mut cuda_types::cublas::cuDoubleComplex, + ldc: ::core::ffi::c_int, + info: *mut ::core::ffi::c_int, + batchSize: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasSgetrsBatched( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, + n: ::core::ffi::c_int, + nrhs: ::core::ffi::c_int, + Aarray: *const *const f32, + lda: ::core::ffi::c_int, + devIpiv: *const ::core::ffi::c_int, + Barray: *const *mut f32, + ldb: 
::core::ffi::c_int, + info: *mut ::core::ffi::c_int, + batchSize: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasDgetrsBatched( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, + n: ::core::ffi::c_int, + nrhs: ::core::ffi::c_int, + Aarray: *const *const f64, + lda: ::core::ffi::c_int, + devIpiv: *const ::core::ffi::c_int, + Barray: *const *mut f64, + ldb: ::core::ffi::c_int, + info: *mut ::core::ffi::c_int, + batchSize: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasCgetrsBatched( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, + n: ::core::ffi::c_int, + nrhs: ::core::ffi::c_int, + Aarray: *const *const cuda_types::cublas::cuComplex, + lda: ::core::ffi::c_int, + devIpiv: *const ::core::ffi::c_int, + Barray: *const *mut cuda_types::cublas::cuComplex, + ldb: ::core::ffi::c_int, + info: *mut ::core::ffi::c_int, + batchSize: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasZgetrsBatched( + handle: cuda_types::cublas::cublasHandle_t, + trans: cuda_types::cublas::cublasOperation_t, + n: ::core::ffi::c_int, + nrhs: ::core::ffi::c_int, + Aarray: *const *const cuda_types::cublas::cuDoubleComplex, + lda: ::core::ffi::c_int, + devIpiv: *const ::core::ffi::c_int, + Barray: *const *mut cuda_types::cublas::cuDoubleComplex, + ldb: ::core::ffi::c_int, + info: *mut ::core::ffi::c_int, + batchSize: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; + #[must_use] + fn cublasUint8gemmBias( + handle: cuda_types::cublas::cublasHandle_t, + transa: cuda_types::cublas::cublasOperation_t, + transb: cuda_types::cublas::cublasOperation_t, + transc: cuda_types::cublas::cublasOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + k: ::core::ffi::c_int, + A: *const ::core::ffi::c_uchar, + A_bias: ::core::ffi::c_int, + lda: ::core::ffi::c_int, + B: *const ::core::ffi::c_uchar, + B_bias: ::core::ffi::c_int, + ldb: ::core::ffi::c_int, + C: *mut ::core::ffi::c_uchar, + C_bias: ::core::ffi::c_int, + ldc: ::core::ffi::c_int, + C_mult: ::core::ffi::c_int, + C_shift: ::core::ffi::c_int, + ) -> cuda_types::cublas::cublasStatus_t; +} diff --git a/cuda_base/src/cublaslt.rs b/cuda_base/src/cublaslt.rs new file mode 100644 index 0000000..b18cc1c --- /dev/null +++ b/cuda_base/src/cublaslt.rs @@ -0,0 +1,581 @@ +// Generated automatically by zluda_bindgen +// DO NOT EDIT MANUALLY +#![allow(warnings)] +extern "system" { + #[must_use] + fn cublasLtCreate( + lightHandle: *mut cuda_types::cublaslt::cublasLtHandle_t, + ) -> cuda_types::cublaslt::cublasStatus_t; + #[must_use] + fn cublasLtDestroy( + lightHandle: cuda_types::cublaslt::cublasLtHandle_t, + ) -> cuda_types::cublaslt::cublasStatus_t; + fn cublasLtGetStatusName( + status: cuda_types::cublaslt::cublasStatus_t, + ) -> *const ::core::ffi::c_char; + fn cublasLtGetStatusString( + status: cuda_types::cublaslt::cublasStatus_t, + ) -> *const ::core::ffi::c_char; + fn cublasLtGetVersion() -> usize; + fn cublasLtGetCudartVersion() -> usize; + #[must_use] + fn cublasLtGetProperty( + type_: cuda_types::cublaslt::libraryPropertyType, + value: *mut ::core::ffi::c_int, + ) -> cuda_types::cublaslt::cublasStatus_t; + #[must_use] + fn cublasLtHeuristicsCacheGetCapacity( + capacity: *mut usize, + ) -> cuda_types::cublaslt::cublasStatus_t; + #[must_use] + fn cublasLtHeuristicsCacheSetCapacity( + capacity: usize, + ) -> cuda_types::cublaslt::cublasStatus_t; + /** 
Restricts usage of CPU instructions (ISA) specified by the flags in the mask. + + Flags can be combined with bitwise OR(|) operator. Supported flags: + - 0x1 -- x86-64 AVX512 ISA + + Default mask: 0 (any applicable ISA is allowed). + + The function returns the previous value of the mask. + The function takes precedence over the environment variable CUBLASLT_DISABLE_CPU_INSTRUCTIONS_MASK.*/ + fn cublasLtDisableCpuInstructionsSetMask( + mask: ::core::ffi::c_uint, + ) -> ::core::ffi::c_uint; + #[must_use] + /** Execute matrix multiplication (D = alpha * op(A) * op(B) + beta * C). + + \retval CUBLAS_STATUS_NOT_INITIALIZED if cuBLASLt handle has not been initialized + \retval CUBLAS_STATUS_INVALID_VALUE if parameters are in conflict or in an impossible configuration; e.g. + when workspaceSizeInBytes is less than workspace required by configured + algo + \retval CUBLAS_STATUS_NOT_SUPPORTED if current implementation on selected device doesn't support configured + operation + \retval CUBLAS_STATUS_ARCH_MISMATCH if configured operation cannot be run using selected device + \retval CUBLAS_STATUS_EXECUTION_FAILED if cuda reported execution error from the device + \retval CUBLAS_STATUS_SUCCESS if the operation completed successfully*/ + fn cublasLtMatmul( + lightHandle: cuda_types::cublaslt::cublasLtHandle_t, + computeDesc: cuda_types::cublaslt::cublasLtMatmulDesc_t, + alpha: *const ::core::ffi::c_void, + A: *const ::core::ffi::c_void, + Adesc: cuda_types::cublaslt::cublasLtMatrixLayout_t, + B: *const ::core::ffi::c_void, + Bdesc: cuda_types::cublaslt::cublasLtMatrixLayout_t, + beta: *const ::core::ffi::c_void, + C: *const ::core::ffi::c_void, + Cdesc: cuda_types::cublaslt::cublasLtMatrixLayout_t, + D: *mut ::core::ffi::c_void, + Ddesc: cuda_types::cublaslt::cublasLtMatrixLayout_t, + algo: *const cuda_types::cublaslt::cublasLtMatmulAlgo_t, + workspace: *mut ::core::ffi::c_void, + workspaceSizeInBytes: usize, + stream: cuda_types::cublaslt::cudaStream_t, + ) -> cuda_types::cublaslt::cublasStatus_t; + #[must_use] + /** Matrix layout conversion helper (C = alpha * op(A) + beta * op(B)) + + Can be used to change memory order of data or to scale and shift the values. + + \retval CUBLAS_STATUS_NOT_INITIALIZED if cuBLASLt handle has not been initialized + \retval CUBLAS_STATUS_INVALID_VALUE if parameters are in conflict or in an impossible configuration; e.g. + when A is not NULL, but Adesc is NULL + \retval CUBLAS_STATUS_NOT_SUPPORTED if current implementation on selected device doesn't support configured + operation + \retval CUBLAS_STATUS_ARCH_MISMATCH if configured operation cannot be run using selected device + \retval CUBLAS_STATUS_EXECUTION_FAILED if cuda reported execution error from the device + \retval CUBLAS_STATUS_SUCCESS if the operation completed successfully*/ + fn cublasLtMatrixTransform( + lightHandle: cuda_types::cublaslt::cublasLtHandle_t, + transformDesc: cuda_types::cublaslt::cublasLtMatrixTransformDesc_t, + alpha: *const ::core::ffi::c_void, + A: *const ::core::ffi::c_void, + Adesc: cuda_types::cublaslt::cublasLtMatrixLayout_t, + beta: *const ::core::ffi::c_void, + B: *const ::core::ffi::c_void, + Bdesc: cuda_types::cublaslt::cublasLtMatrixLayout_t, + C: *mut ::core::ffi::c_void, + Cdesc: cuda_types::cublaslt::cublasLtMatrixLayout_t, + stream: cuda_types::cublaslt::cudaStream_t, + ) -> cuda_types::cublaslt::cublasStatus_t; + #[must_use] + /// Internal. Do not use directly. 
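+    // A condensed, illustrative sketch (not part of the generated bindings) of
+    // the call sequence these entry points are designed around: create matrix
+    // layouts and an operation descriptor, query a heuristic, then run the
+    // multiplication. Every name below is a placeholder and error handling is
+    // elided:
+    //
+    //     cublasLtMatrixLayoutCreate(&mut a_layout, CUDA_R_32F, m, k, m as i64);
+    //     // ...likewise for b_layout, c_layout and d_layout...
+    //     cublasLtMatmulDescCreate(&mut desc, CUBLAS_COMPUTE_32F, CUDA_R_32F);
+    //     cublasLtMatmulPreferenceCreate(&mut pref);
+    //     cublasLtMatmulAlgoGetHeuristic(handle, desc, a_layout, b_layout,
+    //         c_layout, d_layout, pref, 1, &mut heuristic, &mut count);
+    //     cublasLtMatmul(handle, desc, alpha, a, a_layout, b, b_layout, beta,
+    //         c, c_layout, d, d_layout, &heuristic.algo, workspace, ws_size,
+    //         stream);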
+ fn cublasLtMatrixLayoutInit_internal( + matLayout: cuda_types::cublaslt::cublasLtMatrixLayout_t, + size: usize, + type_: cuda_types::cublaslt::cudaDataType, + rows: u64, + cols: u64, + ld: i64, + ) -> cuda_types::cublaslt::cublasStatus_t; + #[must_use] + /** Create new matrix layout descriptor. + + \retval CUBLAS_STATUS_ALLOC_FAILED if memory could not be allocated + \retval CUBLAS_STATUS_SUCCESS if descriptor was created successfully*/ + fn cublasLtMatrixLayoutCreate( + matLayout: *mut cuda_types::cublaslt::cublasLtMatrixLayout_t, + type_: cuda_types::cublaslt::cudaDataType, + rows: u64, + cols: u64, + ld: i64, + ) -> cuda_types::cublaslt::cublasStatus_t; + #[must_use] + /** Destroy matrix layout descriptor. + + \retval CUBLAS_STATUS_SUCCESS if operation was successful*/ + fn cublasLtMatrixLayoutDestroy( + matLayout: cuda_types::cublaslt::cublasLtMatrixLayout_t, + ) -> cuda_types::cublaslt::cublasStatus_t; + #[must_use] + /** Set matrix layout descriptor attribute. + + \param[in] matLayout The descriptor + \param[in] attr The attribute + \param[in] buf memory address containing the new value + \param[in] sizeInBytes size of buf buffer for verification (in bytes) + + \retval CUBLAS_STATUS_INVALID_VALUE if buf is NULL or sizeInBytes doesn't match size of internal storage for + selected attribute + \retval CUBLAS_STATUS_SUCCESS if attribute was set successfully*/ + fn cublasLtMatrixLayoutSetAttribute( + matLayout: cuda_types::cublaslt::cublasLtMatrixLayout_t, + attr: cuda_types::cublaslt::cublasLtMatrixLayoutAttribute_t, + buf: *const ::core::ffi::c_void, + sizeInBytes: usize, + ) -> cuda_types::cublaslt::cublasStatus_t; + #[must_use] + /** Get matrix layout descriptor attribute. + + \param[in] matLayout The descriptor + \param[in] attr The attribute + \param[out] buf memory address containing the new value + \param[in] sizeInBytes size of buf buffer for verification (in bytes) + \param[out] sizeWritten only valid when return value is CUBLAS_STATUS_SUCCESS. If sizeInBytes is non-zero: number of + bytes actually written, if sizeInBytes is 0: number of bytes needed to write full contents + + \retval CUBLAS_STATUS_INVALID_VALUE if sizeInBytes is 0 and sizeWritten is NULL, or if sizeInBytes is non-zero + and buf is NULL or sizeInBytes doesn't match size of internal storage for + selected attribute + \retval CUBLAS_STATUS_SUCCESS if attribute's value was successfully written to user memory*/ + fn cublasLtMatrixLayoutGetAttribute( + matLayout: cuda_types::cublaslt::cublasLtMatrixLayout_t, + attr: cuda_types::cublaslt::cublasLtMatrixLayoutAttribute_t, + buf: *mut ::core::ffi::c_void, + sizeInBytes: usize, + sizeWritten: *mut usize, + ) -> cuda_types::cublaslt::cublasStatus_t; + #[must_use] + /// Internal. Do not use directly. + fn cublasLtMatmulDescInit_internal( + matmulDesc: cuda_types::cublaslt::cublasLtMatmulDesc_t, + size: usize, + computeType: cuda_types::cublaslt::cublasComputeType_t, + scaleType: cuda_types::cublaslt::cudaDataType_t, + ) -> cuda_types::cublaslt::cublasStatus_t; + #[must_use] + /** Create new matmul operation descriptor. + + \retval CUBLAS_STATUS_ALLOC_FAILED if memory could not be allocated + \retval CUBLAS_STATUS_SUCCESS if descriptor was created successfully*/ + fn cublasLtMatmulDescCreate( + matmulDesc: *mut cuda_types::cublaslt::cublasLtMatmulDesc_t, + computeType: cuda_types::cublaslt::cublasComputeType_t, + scaleType: cuda_types::cublaslt::cudaDataType_t, + ) -> cuda_types::cublaslt::cublasStatus_t; + #[must_use] + /** Destroy matmul operation descriptor.
+ + \retval CUBLAS_STATUS_SUCCESS if operation was successful*/ + fn cublasLtMatmulDescDestroy( + matmulDesc: cuda_types::cublaslt::cublasLtMatmulDesc_t, + ) -> cuda_types::cublaslt::cublasStatus_t; + #[must_use] + /** Set matmul operation descriptor attribute. + + \param[in] matmulDesc The descriptor + \param[in] attr The attribute + \param[in] buf memory address containing the new value + \param[in] sizeInBytes size of buf buffer for verification (in bytes) + + \retval CUBLAS_STATUS_INVALID_VALUE if buf is NULL or sizeInBytes doesn't match size of internal storage for + selected attribute + \retval CUBLAS_STATUS_SUCCESS if attribute was set successfully*/ + fn cublasLtMatmulDescSetAttribute( + matmulDesc: cuda_types::cublaslt::cublasLtMatmulDesc_t, + attr: cuda_types::cublaslt::cublasLtMatmulDescAttributes_t, + buf: *const ::core::ffi::c_void, + sizeInBytes: usize, + ) -> cuda_types::cublaslt::cublasStatus_t; + #[must_use] + /** Get matmul operation descriptor attribute. + + \param[in] matmulDesc The descriptor + \param[in] attr The attribute + \param[out] buf memory address containing the new value + \param[in] sizeInBytes size of buf buffer for verification (in bytes) + \param[out] sizeWritten only valid when return value is CUBLAS_STATUS_SUCCESS. If sizeInBytes is non-zero: number of + bytes actually written, if sizeInBytes is 0: number of bytes needed to write full contents + + \retval CUBLAS_STATUS_INVALID_VALUE if sizeInBytes is 0 and sizeWritten is NULL, or if sizeInBytes is non-zero + and buf is NULL or sizeInBytes doesn't match size of internal storage for + selected attribute + \retval CUBLAS_STATUS_SUCCESS if attribute's value was successfully written to user memory*/ + fn cublasLtMatmulDescGetAttribute( + matmulDesc: cuda_types::cublaslt::cublasLtMatmulDesc_t, + attr: cuda_types::cublaslt::cublasLtMatmulDescAttributes_t, + buf: *mut ::core::ffi::c_void, + sizeInBytes: usize, + sizeWritten: *mut usize, + ) -> cuda_types::cublaslt::cublasStatus_t; + #[must_use] + /// Internal. Do not use directly. + fn cublasLtMatrixTransformDescInit_internal( + transformDesc: cuda_types::cublaslt::cublasLtMatrixTransformDesc_t, + size: usize, + scaleType: cuda_types::cublaslt::cudaDataType, + ) -> cuda_types::cublaslt::cublasStatus_t; + #[must_use] + /** Create new matrix transform operation descriptor. + + \retval CUBLAS_STATUS_ALLOC_FAILED if memory could not be allocated + \retval CUBLAS_STATUS_SUCCESS if descriptor was created successfully*/ + fn cublasLtMatrixTransformDescCreate( + transformDesc: *mut cuda_types::cublaslt::cublasLtMatrixTransformDesc_t, + scaleType: cuda_types::cublaslt::cudaDataType, + ) -> cuda_types::cublaslt::cublasStatus_t; + #[must_use] + /** Destroy matrix transform operation descriptor. + + \retval CUBLAS_STATUS_SUCCESS if operation was successful*/ + fn cublasLtMatrixTransformDescDestroy( + transformDesc: cuda_types::cublaslt::cublasLtMatrixTransformDesc_t, + ) -> cuda_types::cublaslt::cublasStatus_t; + #[must_use] + /** Set matrix transform operation descriptor attribute.
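+
+ For example (an illustrative Rust fragment; `transformDesc` comes from
+ cublasLtMatrixTransformDescCreate), a 32-bit attribute is passed by address
+ together with its size:
+
+     let op = cublasOperation_t::CUBLAS_OP_T;
+     cublasLtMatrixTransformDescSetAttribute(
+         transformDesc,
+         cublasLtMatrixTransformDescAttributes_t::CUBLASLT_MATRIX_TRANSFORM_DESC_TRANSA,
+         &op as *const _ as *const ::core::ffi::c_void,
+         ::core::mem::size_of_val(&op),
+     );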
+ + \param[in] transformDesc The descriptor + \param[in] attr The attribute + \param[in] buf memory address containing the new value + \param[in] sizeInBytes size of buf buffer for verification (in bytes) + + \retval CUBLAS_STATUS_INVALID_VALUE if buf is NULL or sizeInBytes doesn't match size of internal storage for + selected attribute + \retval CUBLAS_STATUS_SUCCESS if attribute was set successfully*/ + fn cublasLtMatrixTransformDescSetAttribute( + transformDesc: cuda_types::cublaslt::cublasLtMatrixTransformDesc_t, + attr: cuda_types::cublaslt::cublasLtMatrixTransformDescAttributes_t, + buf: *const ::core::ffi::c_void, + sizeInBytes: usize, + ) -> cuda_types::cublaslt::cublasStatus_t; + #[must_use] + /** Get matrix transform operation descriptor attribute. + + \param[in] transformDesc The descriptor + \param[in] attr The attribute + \param[out] buf memory address containing the new value + \param[in] sizeInBytes size of buf buffer for verification (in bytes) + \param[out] sizeWritten only valid when return value is CUBLAS_STATUS_SUCCESS. If sizeInBytes is non-zero: number + of bytes actually written, if sizeInBytes is 0: number of bytes needed to write full contents + + \retval CUBLAS_STATUS_INVALID_VALUE if sizeInBytes is 0 and sizeWritten is NULL, or if sizeInBytes is non-zero + and buf is NULL or sizeInBytes doesn't match size of internal storage for + selected attribute + \retval CUBLAS_STATUS_SUCCESS if attribute's value was successfully written to user memory*/ + fn cublasLtMatrixTransformDescGetAttribute( + transformDesc: cuda_types::cublaslt::cublasLtMatrixTransformDesc_t, + attr: cuda_types::cublaslt::cublasLtMatrixTransformDescAttributes_t, + buf: *mut ::core::ffi::c_void, + sizeInBytes: usize, + sizeWritten: *mut usize, + ) -> cuda_types::cublaslt::cublasStatus_t; + #[must_use] + /// Internal. Do not use directly. + fn cublasLtMatmulPreferenceInit_internal( + pref: cuda_types::cublaslt::cublasLtMatmulPreference_t, + size: usize, + ) -> cuda_types::cublaslt::cublasStatus_t; + #[must_use] + /** Create new matmul heuristic search preference descriptor. + + \retval CUBLAS_STATUS_ALLOC_FAILED if memory could not be allocated + \retval CUBLAS_STATUS_SUCCESS if descriptor was created successfully*/ + fn cublasLtMatmulPreferenceCreate( + pref: *mut cuda_types::cublaslt::cublasLtMatmulPreference_t, + ) -> cuda_types::cublaslt::cublasStatus_t; + #[must_use] + /** Destroy matmul heuristic search preference descriptor. + + \retval CUBLAS_STATUS_SUCCESS if operation was successful*/ + fn cublasLtMatmulPreferenceDestroy( + pref: cuda_types::cublaslt::cublasLtMatmulPreference_t, + ) -> cuda_types::cublaslt::cublasStatus_t; + #[must_use] + /** Set matmul heuristic search preference descriptor attribute. + + \param[in] pref The descriptor + \param[in] attr The attribute + \param[in] buf memory address containing the new value + \param[in] sizeInBytes size of buf buffer for verification (in bytes) + + \retval CUBLAS_STATUS_INVALID_VALUE if buf is NULL or sizeInBytes doesn't match size of internal storage for + selected attribute + \retval CUBLAS_STATUS_SUCCESS if attribute was set successfully*/ + fn cublasLtMatmulPreferenceSetAttribute( + pref: cuda_types::cublaslt::cublasLtMatmulPreference_t, + attr: cuda_types::cublaslt::cublasLtMatmulPreferenceAttributes_t, + buf: *const ::core::ffi::c_void, + sizeInBytes: usize, + ) -> cuda_types::cublaslt::cublasStatus_t; + #[must_use] + /** Get matmul heuristic search preference descriptor attribute.
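+
+ For example (an illustrative Rust fragment), reading back the configured
+ workspace limit:
+
+     let mut max_ws = 0u64;
+     let mut written = 0usize;
+     cublasLtMatmulPreferenceGetAttribute(pref,
+         cublasLtMatmulPreferenceAttributes_t::CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
+         &mut max_ws as *mut _ as *mut ::core::ffi::c_void,
+         ::core::mem::size_of_val(&max_ws), &mut written);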
+ + \param[in] pref The descriptor + \param[in] attr The attribute + \param[out] buf memory address containing the new value + \param[in] sizeInBytes size of buf buffer for verification (in bytes) + \param[out] sizeWritten only valid when return value is CUBLAS_STATUS_SUCCESS. If sizeInBytes is non-zero: number of + bytes actually written, if sizeInBytes is 0: number of bytes needed to write full contents + + \retval CUBLAS_STATUS_INVALID_VALUE if sizeInBytes is 0 and sizeWritten is NULL, or if sizeInBytes is non-zero + and buf is NULL or sizeInBytes doesn't match size of internal storage for + selected attribute + \retval CUBLAS_STATUS_SUCCESS if attribute's value was successfully written to user memory*/ + fn cublasLtMatmulPreferenceGetAttribute( + pref: cuda_types::cublaslt::cublasLtMatmulPreference_t, + attr: cuda_types::cublaslt::cublasLtMatmulPreferenceAttributes_t, + buf: *mut ::core::ffi::c_void, + sizeInBytes: usize, + sizeWritten: *mut usize, + ) -> cuda_types::cublaslt::cublasStatus_t; + #[must_use] + /** Query cublasLt heuristic for algorithm appropriate for given use case. + + \param[in] lightHandle Pointer to the allocated cuBLASLt handle for the cuBLASLt + context. See cublasLtHandle_t. + \param[in] operationDesc Handle to the matrix multiplication descriptor. + \param[in] Adesc Handle to the layout descriptors for matrix A. + \param[in] Bdesc Handle to the layout descriptors for matrix B. + \param[in] Cdesc Handle to the layout descriptors for matrix C. + \param[in] Ddesc Handle to the layout descriptors for matrix D. + \param[in] preference Pointer to the structure holding the heuristic search + preferences descriptor. See cublasLtMatrixLayout_t. + \param[in] requestedAlgoCount Size of heuristicResultsArray (in elements) and requested + maximum number of algorithms to return. + \param[in, out] heuristicResultsArray Output algorithms and associated runtime characteristics, + ordered in increasing estimated compute time. + \param[out] returnAlgoCount The number of heuristicResultsArray elements written. 
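+
+ For example (an illustrative Rust fragment; all handles are placeholders),
+ requesting a single algorithm:
+
+     let mut result: cublasLtMatmulHeuristicResult_t = unsafe { ::core::mem::zeroed() };
+     let mut returned = 0;
+     cublasLtMatmulAlgoGetHeuristic(lightHandle, operationDesc, Adesc, Bdesc,
+         Cdesc, Ddesc, preference, 1, &mut result, &mut returned);
+     // when returned == 1, result.algo can be passed to cublasLtMatmul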
+ + \retval CUBLAS_STATUS_INVALID_VALUE if requestedAlgoCount is less than or equal to zero + \retval CUBLAS_STATUS_NOT_SUPPORTED if no heuristic function is available for current configuration + \retval CUBLAS_STATUS_SUCCESS if query was successful, inspect + heuristicResultsArray[0 to (returnAlgoCount - 1)].state + for detail status of results*/ + fn cublasLtMatmulAlgoGetHeuristic( + lightHandle: cuda_types::cublaslt::cublasLtHandle_t, + operationDesc: cuda_types::cublaslt::cublasLtMatmulDesc_t, + Adesc: cuda_types::cublaslt::cublasLtMatrixLayout_t, + Bdesc: cuda_types::cublaslt::cublasLtMatrixLayout_t, + Cdesc: cuda_types::cublaslt::cublasLtMatrixLayout_t, + Ddesc: cuda_types::cublaslt::cublasLtMatrixLayout_t, + preference: cuda_types::cublaslt::cublasLtMatmulPreference_t, + requestedAlgoCount: ::core::ffi::c_int, + heuristicResultsArray: *mut cuda_types::cublaslt::cublasLtMatmulHeuristicResult_t, + returnAlgoCount: *mut ::core::ffi::c_int, + ) -> cuda_types::cublaslt::cublasStatus_t; + #[must_use] + /** Routine to get all algo IDs that can potentially run + + \param[in] requestedAlgoCount requested number of algos (must be less than or equal to the size of algoIdsArray + (in elements)) + \param[out] algoIdsArray array to write algo IDs to + \param[out] returnAlgoCount number of algo IDs actually written + + \retval CUBLAS_STATUS_INVALID_VALUE if requestedAlgoCount is less than or equal to zero + \retval CUBLAS_STATUS_SUCCESS if query was successful, inspect returnAlgoCount to get actual number of IDs + available*/ + fn cublasLtMatmulAlgoGetIds( + lightHandle: cuda_types::cublaslt::cublasLtHandle_t, + computeType: cuda_types::cublaslt::cublasComputeType_t, + scaleType: cuda_types::cublaslt::cudaDataType_t, + Atype: cuda_types::cublaslt::cudaDataType_t, + Btype: cuda_types::cublaslt::cudaDataType_t, + Ctype: cuda_types::cublaslt::cudaDataType_t, + Dtype: cuda_types::cublaslt::cudaDataType_t, + requestedAlgoCount: ::core::ffi::c_int, + algoIdsArray: *mut ::core::ffi::c_int, + returnAlgoCount: *mut ::core::ffi::c_int, + ) -> cuda_types::cublaslt::cublasStatus_t; + #[must_use] + /** Initialize algo structure + + \retval CUBLAS_STATUS_INVALID_VALUE if algo is NULL or algoId is outside of recognized range + \retval CUBLAS_STATUS_NOT_SUPPORTED if algoId is not supported for given combination of data types + \retval CUBLAS_STATUS_SUCCESS if the structure was successfully initialized*/ + fn cublasLtMatmulAlgoInit( + lightHandle: cuda_types::cublaslt::cublasLtHandle_t, + computeType: cuda_types::cublaslt::cublasComputeType_t, + scaleType: cuda_types::cublaslt::cudaDataType_t, + Atype: cuda_types::cublaslt::cudaDataType_t, + Btype: cuda_types::cublaslt::cudaDataType_t, + Ctype: cuda_types::cublaslt::cudaDataType_t, + Dtype: cuda_types::cublaslt::cudaDataType_t, + algoId: ::core::ffi::c_int, + algo: *mut cuda_types::cublaslt::cublasLtMatmulAlgo_t, + ) -> cuda_types::cublaslt::cublasStatus_t; + #[must_use] + /** Check configured algo descriptor for correctness and support on current device. + + Result includes required workspace size and calculated wave count. + + CUBLAS_STATUS_SUCCESS doesn't fully guarantee algo will run (will fail if e.g. buffers are not correctly aligned); + but if cublasLtMatmulAlgoCheck fails, the algo will not run.
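+
+ For example (an illustrative Rust fragment), validating an algo right before
+ use:
+
+     let mut check: cublasLtMatmulHeuristicResult_t = unsafe { ::core::mem::zeroed() };
+     let status = cublasLtMatmulAlgoCheck(lightHandle, operationDesc, Adesc,
+         Bdesc, Cdesc, Ddesc, &algo, &mut check);
+     // proceed only when status is CUBLAS_STATUS_SUCCESS; check.workspaceSize
+     // reports the workspace this algo requires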
+ + \param[in] algo algo configuration to check + \param[out] result result structure to report algo runtime characteristics; algo field is never updated + + \retval CUBLAS_STATUS_INVALID_VALUE if matrix layout descriptors or operation descriptor don't match algo + descriptor + \retval CUBLAS_STATUS_NOT_SUPPORTED if algo configuration or data type combination is not currently supported on + given device + \retval CUBLAS_STATUS_ARCH_MISMATCH if algo configuration cannot be run using the selected device + \retval CUBLAS_STATUS_SUCCESS if check was successful*/ + fn cublasLtMatmulAlgoCheck( + lightHandle: cuda_types::cublaslt::cublasLtHandle_t, + operationDesc: cuda_types::cublaslt::cublasLtMatmulDesc_t, + Adesc: cuda_types::cublaslt::cublasLtMatrixLayout_t, + Bdesc: cuda_types::cublaslt::cublasLtMatrixLayout_t, + Cdesc: cuda_types::cublaslt::cublasLtMatrixLayout_t, + Ddesc: cuda_types::cublaslt::cublasLtMatrixLayout_t, + algo: *const cuda_types::cublaslt::cublasLtMatmulAlgo_t, + result: *mut cuda_types::cublaslt::cublasLtMatmulHeuristicResult_t, + ) -> cuda_types::cublaslt::cublasStatus_t; + #[must_use] + /** Get algo capability attribute. + + E.g. to get list of supported Tile IDs: + cublasLtMatmulTile_t tiles[CUBLASLT_MATMUL_TILE_END]; + size_t num_tiles, size_written; + if (cublasLtMatmulAlgoCapGetAttribute(algo, CUBLASLT_ALGO_CAP_TILE_IDS, tiles, sizeof(tiles), size_written) == + CUBLAS_STATUS_SUCCESS) { num_tiles = size_written / sizeof(tiles[0]); + } + + \param[in] algo The algo descriptor + \param[in] attr The attribute + \param[out] buf memory address containing the new value + \param[in] sizeInBytes size of buf buffer for verification (in bytes) + \param[out] sizeWritten only valid when return value is CUBLAS_STATUS_SUCCESS. If sizeInBytes is non-zero: number of + bytes actually written, if sizeInBytes is 0: number of bytes needed to write full contents + + \retval CUBLAS_STATUS_INVALID_VALUE if sizeInBytes is 0 and sizeWritten is NULL, or if sizeInBytes is non-zero + and buf is NULL or sizeInBytes doesn't match size of internal storage for + selected attribute + \retval CUBLAS_STATUS_SUCCESS if attribute's value was successfully written to user memory*/ + fn cublasLtMatmulAlgoCapGetAttribute( + algo: *const cuda_types::cublaslt::cublasLtMatmulAlgo_t, + attr: cuda_types::cublaslt::cublasLtMatmulAlgoCapAttributes_t, + buf: *mut ::core::ffi::c_void, + sizeInBytes: usize, + sizeWritten: *mut usize, + ) -> cuda_types::cublaslt::cublasStatus_t; + #[must_use] + /** Set algo configuration attribute. + + \param[in] algo The algo descriptor + \param[in] attr The attribute + \param[in] buf memory address containing the new value + \param[in] sizeInBytes size of buf buffer for verification (in bytes) + + \retval CUBLAS_STATUS_INVALID_VALUE if buf is NULL or sizeInBytes doesn't match size of internal storage for + selected attribute + \retval CUBLAS_STATUS_SUCCESS if attribute was set successfully*/ + fn cublasLtMatmulAlgoConfigSetAttribute( + algo: *mut cuda_types::cublaslt::cublasLtMatmulAlgo_t, + attr: cuda_types::cublaslt::cublasLtMatmulAlgoConfigAttributes_t, + buf: *const ::core::ffi::c_void, + sizeInBytes: usize, + ) -> cuda_types::cublaslt::cublasStatus_t; + #[must_use] + /** Get algo configuration attribute. + + \param[in] algo The algo descriptor + \param[in] attr The attribute + \param[out] buf memory address containing the new value + \param[in] sizeInBytes size of buf buffer for verification (in bytes) + \param[out] sizeWritten only valid when return value is CUBLAS_STATUS_SUCCESS. 
If sizeInBytes is non-zero: number of + bytes actually written, if sizeInBytes is 0: number of bytes needed to write full contents + + \retval CUBLAS_STATUS_INVALID_VALUE if sizeInBytes is 0 and sizeWritten is NULL, or if sizeInBytes is non-zero + and buf is NULL or sizeInBytes doesn't match size of internal storage for + selected attribute + \retval CUBLAS_STATUS_SUCCESS if attribute's value was successfully written to user memory*/ + fn cublasLtMatmulAlgoConfigGetAttribute( + algo: *const cuda_types::cublaslt::cublasLtMatmulAlgo_t, + attr: cuda_types::cublaslt::cublasLtMatmulAlgoConfigAttributes_t, + buf: *mut ::core::ffi::c_void, + sizeInBytes: usize, + sizeWritten: *mut usize, + ) -> cuda_types::cublaslt::cublasStatus_t; + #[must_use] + /** Experimental: Logger callback setter. + + \param[in] callback a user defined callback function to be called by the logger + + \retval CUBLAS_STATUS_SUCCESS if callback was set successfully*/ + fn cublasLtLoggerSetCallback( + callback: cuda_types::cublaslt::cublasLtLoggerCallback_t, + ) -> cuda_types::cublaslt::cublasStatus_t; + #[must_use] + /** Experimental: Log file setter. + + \param[in] file an open file with write permissions + + \retval CUBLAS_STATUS_SUCCESS if log file was set successfully*/ + fn cublasLtLoggerSetFile(file: *mut FILE) -> cuda_types::cublaslt::cublasStatus_t; + #[must_use] + /** Experimental: Open log file. + + \param[in] logFile log file path. if the log file does not exist, it will be created + + \retval CUBLAS_STATUS_SUCCESS if log file was created successfully*/ + fn cublasLtLoggerOpenFile( + logFile: *const ::core::ffi::c_char, + ) -> cuda_types::cublaslt::cublasStatus_t; + #[must_use] + /** Experimental: Log level setter. + + \param[in] level log level, should be one of the following: + 0. Off + 1. Errors + 2. Performance Trace + 3. Performance Hints + 4. Heuristics Trace + 5. API Trace + + \retval CUBLAS_STATUS_INVALID_VALUE if log level is not one of the above levels + + \retval CUBLAS_STATUS_SUCCESS if log level was set successfully*/ + fn cublasLtLoggerSetLevel( + level: ::core::ffi::c_int, + ) -> cuda_types::cublaslt::cublasStatus_t; + #[must_use] + /** Experimental: Log mask setter. + + \param[in] mask log mask, should be a combination of the following masks: + 0. Off + 1. Errors + 2. Performance Trace + 4. Performance Hints + 8. Heuristics Trace + 16. API Trace + + \retval CUBLAS_STATUS_SUCCESS if log mask was set successfully*/ + fn cublasLtLoggerSetMask( + mask: ::core::ffi::c_int, + ) -> cuda_types::cublaslt::cublasStatus_t; + #[must_use] + /** Experimental: Disable logging for the entire session. + + \retval CUBLAS_STATUS_SUCCESS if disabled logging*/ + fn cublasLtLoggerForceDisable() -> cuda_types::cublaslt::cublasStatus_t; +} diff --git a/cuda_base/src/cuda.rs b/cuda_base/src/cuda.rs index 37aadf1..a53f6a9 100644 --- a/cuda_base/src/cuda.rs +++ b/cuda_base/src/cuda.rs @@ -521,6 +521,12 @@ extern "system" { - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING: GPUDirect RDMA writes to the device do not need to be flushed for consumers within the scope indicated by the returned attribute. See ::CUGPUDirectRDMAWritesOrdering for the numerical values returned here. - ::CU_DEVICE_ATTRIBUTE_MEMPOOL_SUPPORTED_HANDLE_TYPES: Bitmask of handle types supported with mempool based IPC - ::CU_DEVICE_ATTRIBUTE_DEFERRED_MAPPING_CUDA_ARRAY_SUPPORTED: Device supports deferred mapping CUDA arrays and CUDA mipmapped arrays. 
+ - ::CU_DEVICE_ATTRIBUTE_NUMA_CONFIG: NUMA configuration of a device: value is of type ::CUdeviceNumaConfig enum
+ - ::CU_DEVICE_ATTRIBUTE_NUMA_ID: NUMA node ID of the GPU memory
+ - ::CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED: Device supports switch multicast and reduction operations.
+ - ::CU_DEVICE_ATTRIBUTE_GPU_PCI_DEVICE_ID: The combined 16-bit PCI device ID and 16-bit PCI vendor ID.
+ - ::CU_DEVICE_ATTRIBUTE_GPU_PCI_SUBSYSTEM_ID: The combined 16-bit PCI subsystem ID and 16-bit PCI subsystem vendor ID.

 \param pi - Returned device attribute value
 \param attrib - Device attribute to query
@@ -710,6 +716,15 @@ extern "system" {
 determined by comparing the numerical values between the two enums, with
 smaller scopes having smaller values.

+ On platforms that support GPUDirect RDMA writes via more than one path in
+ hardware (see ::CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE), the user should
+ consider those paths as belonging to separate ordering domains. Note that in
+ such cases the CUDA driver will report both RDMA writes ordering and RDMA write
+ scope as ALL_DEVICES and a call to ::cuFlushGPUDirectRDMAWrites will be a no-op,
+ but when these multiple paths are used simultaneously, it is the user's
+ responsibility to ensure ordering by using mechanisms outside the scope of
+ CUDA.
+
 Users may query support for this API via
 ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS.
@@ -1348,6 +1363,163 @@ int textureAlign
 flags: ::core::ffi::c_uint,
 dev: cuda_types::cuda::CUdevice,
 ) -> cuda_types::cuda::CUresult;
+ /** \brief Create a CUDA context
+
+ Creates a new CUDA context and associates it with the calling thread. The
+ \p flags parameter is described below. The context is created with a usage
+ count of 1 and the caller of ::cuCtxCreate() must call ::cuCtxDestroy()
+ when done using the context. If a context is already current to the thread,
+ it is supplanted by the newly created context and may be restored by a subsequent
+ call to ::cuCtxPopCurrent().
+
+ A CUDA context can be created with execution affinity. The type and amount of
+ execution resources the context can use are limited by \p paramsArray and \p numExecAffinityParams
+ in \p execAffinity. The \p paramsArray is an array of \p CUexecAffinityParam and the \p numExecAffinityParams
+ describes the size of the paramsArray. If two \p CUexecAffinityParam in the array have the same type,
+ the latter execution affinity parameter overrides the former execution affinity parameter.
+ The supported execution affinity types are:
+ - ::CU_EXEC_AFFINITY_TYPE_SM_COUNT limits the portion of SMs that the context can use. The portion
+ of SMs is specified as the number of SMs via \p CUexecAffinitySmCount. This limit will be internally
+ rounded up to the next hardware-supported amount. Hence, it is imperative to query the actual execution
+ affinity of the context via \p cuCtxGetExecAffinity after context creation. Currently, this attribute
+ is only supported under Volta+ MPS.
+
+ A CUDA context can be created in CIG (CUDA in Graphics) mode by setting \p cigParams.
+ Data from the graphics client is shared with CUDA via the \p sharedData in \p cigParams.
+ Support for a D3D12 graphics client can be determined using ::cuDeviceGetAttribute() with
+ ::CU_DEVICE_ATTRIBUTE_D3D12_CIG_SUPPORTED. \p sharedData is an ID3D12CommandQueue handle.
+ Either \p execAffinityParams or \p cigParams can be set to a non-null value. Setting both to a
+ non-null value will result in undefined behavior.
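+
+ E.g. a minimal sketch of probing CIG support before requesting it (a
+ hypothetical Rust-side caller; the enum path is assumed from the generated
+ cuda_types, error handling elided):
+
+ let mut cig_supported: i32 = 0;
+ // Non-zero means D3D12 CIG contexts can be created on this device.
+ cuDeviceGetAttribute(&mut cig_supported,
+     CUdevice_attribute::CU_DEVICE_ATTRIBUTE_D3D12_CIG_SUPPORTED, dev);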
+ + The three LSBs of the \p flags parameter can be used to control how the OS + thread, which owns the CUDA context at the time of an API call, interacts + with the OS scheduler when waiting for results from the GPU. Only one of + the scheduling flags can be set when creating a context. + + - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for + results from the GPU. This can decrease latency when waiting for the GPU, + but may lower the performance of CPU threads if they are performing work in + parallel with the CUDA thread. + + - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for + results from the GPU. This can increase latency when waiting for the GPU, + but can increase the performance of CPU threads performing work in parallel + with the GPU. + + - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a + synchronization primitive when waiting for the GPU to finish work. + + - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a + synchronization primitive when waiting for the GPU to finish work.
+ Deprecated: This flag was deprecated as of CUDA 4.0 and was + replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. + + - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero, + uses a heuristic based on the number of active CUDA contexts in the + process \e C and the number of logical processors in the system \e P. If + \e C > \e P, then CUDA will yield to other OS threads when waiting for + the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while + waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN). + Additionally, on Tegra devices, ::CU_CTX_SCHED_AUTO uses a heuristic based on + the power profile of the platform and may choose ::CU_CTX_SCHED_BLOCKING_SYNC + for low-powered devices. + + - ::CU_CTX_MAP_HOST: Instruct CUDA to support mapped pinned allocations. + This flag must be set in order to allocate pinned host memory that is + accessible to the GPU. + + - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory + after resizing local memory for a kernel. This can prevent thrashing by + local memory allocations when launching many kernels with high local + memory usage at the cost of potentially increased memory usage.
+ Deprecated: This flag is deprecated and the behavior enabled
+ by this flag is now the default and cannot be disabled.
+ Instead, the per-thread stack size can be controlled with ::cuCtxSetLimit().
+
+ - ::CU_CTX_COREDUMP_ENABLE: If GPU coredumps have not been enabled globally
+ with ::cuCoredumpSetAttributeGlobal or environment variables, this flag can
+ be set during context creation to instruct CUDA to create a coredump if
+ this context raises an exception during execution. These environment variables
+ are described in the CUDA-GDB user guide under the "GPU core dump support"
+ section.
+ The initial attributes will be taken from the global attributes at the time of
+ context creation. The other attributes that control coredump output can be
+ modified by calling ::cuCoredumpSetAttribute from the created context after
+ it becomes current. This flag is not supported when the CUDA context is created in
+ CIG (CUDA in Graphics) mode.
+
+ - ::CU_CTX_USER_COREDUMP_ENABLE: If user-triggered GPU coredumps have not
+ been enabled globally with ::cuCoredumpSetAttributeGlobal or environment
+ variables, this flag can be set during context creation to instruct CUDA to
+ create a coredump if data is written to a certain pipe that is present in the
+ OS space. These environment variables are described in the CUDA-GDB user
+ guide under the "GPU core dump support" section.
+ It is important to note that the pipe name *must* be set with
+ ::cuCoredumpSetAttributeGlobal before creating the context if this flag is
+ used. Setting this flag implies that ::CU_CTX_COREDUMP_ENABLE is set.
+ The initial attributes will be taken from the global attributes at the time of
+ context creation. The other attributes that control coredump output can be
+ modified by calling ::cuCoredumpSetAttribute from the created context after
+ it becomes current.
+ Setting this flag on any context creation is equivalent to setting the
+ ::CU_COREDUMP_ENABLE_USER_TRIGGER attribute to \p true globally.
+ This flag is not supported when the CUDA context is created in
+ CIG (CUDA in Graphics) mode.
+
+ - ::CU_CTX_SYNC_MEMOPS: Ensures that synchronous memory operations initiated
+ on this context will always synchronize. See further documentation in the
+ section titled "API Synchronization behavior" to learn more about cases when
+ synchronous memory operations can exhibit asynchronous behavior.
+
+ Context creation will fail with ::CUDA_ERROR_UNKNOWN if the compute mode of
+ the device is ::CU_COMPUTEMODE_PROHIBITED. The function ::cuDeviceGetAttribute()
+ can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the
+ compute mode of the device. The nvidia-smi tool can be used to set
+ the compute mode for devices.
+ Documentation for nvidia-smi can be obtained by passing a
+ -h option to it.
+
+ Context creation will fail with ::CUDA_ERROR_INVALID_VALUE if an invalid parameter was
+ passed by the client to create the CUDA context.
+
+ Context creation in CIG mode will fail with ::CUDA_ERROR_NOT_SUPPORTED if CIG is not supported
+ by the device or the driver.
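+
+ E.g. a minimal sketch of plain context creation through this entry point (a
+ hypothetical Rust-side caller; per the description above, NULL creation
+ params request neither execution affinity nor CIG mode; the flag constant is
+ one of the scheduling flags listed above, error handling elided):
+
+ let mut ctx: CUcontext = std::ptr::null_mut();
+ // Ordinary context; the scheduling policy comes from the flags argument.
+ cuCtxCreate_v4(&mut ctx, std::ptr::null_mut(), CU_CTX_SCHED_BLOCKING_SYNC, dev);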
+ \param pctx - Returned context handle of the new context + \param ctxCreateParams - Context creation parameters + \param flags - Context creation flags + \param dev - Device to create context on + + \return + ::CUDA_SUCCESS, + ::CUDA_ERROR_DEINITIALIZED, + ::CUDA_ERROR_NOT_INITIALIZED, + ::CUDA_ERROR_INVALID_CONTEXT, + ::CUDA_ERROR_INVALID_DEVICE, + ::CUDA_ERROR_INVALID_VALUE, + ::CUDA_ERROR_NOT_SUPPORTED, + ::CUDA_ERROR_OUT_OF_MEMORY, + ::CUDA_ERROR_UNKNOWN + \notefnerr + + \sa ::cuCtxDestroy, + ::cuCtxGetApiVersion, + ::cuCtxGetCacheConfig, + ::cuCtxGetDevice, + ::cuCtxGetFlags, + ::cuCtxGetLimit, + ::cuCtxPopCurrent, + ::cuCtxPushCurrent, + ::cuCtxSetCacheConfig, + ::cuCtxSetLimit, + ::cuCoredumpSetAttributeGlobal, + ::cuCoredumpSetAttribute, + ::cuCtxSynchronize*/ + fn cuCtxCreate_v4( + pctx: *mut cuda_types::cuda::CUcontext, + ctxCreateParams: *mut cuda_types::cuda::CUctxCreateParams, + flags: ::core::ffi::c_uint, + dev: cuda_types::cuda::CUdevice, + ) -> cuda_types::cuda::CUresult; /** \brief Destroy a CUDA context Destroys the CUDA context specified by \p ctx. The context \p ctx will be @@ -1358,9 +1530,11 @@ int textureAlign Destroys and cleans up all resources associated with the context. It is the caller's responsibility to ensure that the context or its resources are not accessed or passed in subsequent API calls and doing so will result in undefined behavior. - These resources include CUDA types such as ::CUmodule, ::CUfunction, ::CUstream, ::CUevent, + These resources include CUDA types ::CUmodule, ::CUfunction, ::CUstream, ::CUevent, ::CUarray, ::CUmipmappedArray, ::CUtexObject, ::CUsurfObject, ::CUtexref, ::CUsurfref, ::CUgraphicsResource, ::CUlinkState, ::CUexternalMemory and ::CUexternalSemaphore. + These resources also include memory allocations by ::cuMemAlloc(), ::cuMemAllocHost(), + ::cuMemAllocManaged() and ::cuMemAllocPitch(). If \p ctx is current to the calling thread then \p ctx will also be popped from the current thread's context stack (as though ::cuCtxPopCurrent() @@ -1368,6 +1542,10 @@ int textureAlign remain current to those threads, and attempting to access \p ctx from those threads will result in the error ::CUDA_ERROR_CONTEXT_IS_DESTROYED. + \note ::cuCtxDestroy() will not destroy memory allocations by ::cuMemCreate(), ::cuMemAllocAsync() and + ::cuMemAllocFromPoolAsync(). These memory allocations are not associated with any CUDA context and need to + be destroyed explicitly. + \param ctx - Context to destroy \return @@ -1505,11 +1683,11 @@ int textureAlign fn cuCtxGetCurrent( pctx: *mut cuda_types::cuda::CUcontext, ) -> cuda_types::cuda::CUresult; - /** \brief Returns the device ID for the current context + /** \brief Returns the device handle for the current context - Returns in \p *device the ordinal of the current context's device. + Returns in \p *device the handle of the current context's device. - \param device - Returned device ID for the current context + \param device - Returned device handle for the current context \return ::CUDA_SUCCESS, @@ -1618,9 +1796,11 @@ int textureAlign ctx: cuda_types::cuda::CUcontext, ctxId: *mut ::core::ffi::c_ulonglong, ) -> cuda_types::cuda::CUresult; - /** \brief Block for a context's tasks to complete + /** \brief Block for the current context's tasks to complete - Blocks until the device has completed all preceding requested tasks. + Blocks until the current context has completed all preceding requested tasks. 
+ If the current context is the primary context, green contexts that have been + created will also be synchronized. ::cuCtxSynchronize() returns an error if one of the preceding tasks failed. If the context was created with the ::CU_CTX_SCHED_BLOCKING_SYNC flag, the CPU thread will block until the GPU context has finished its work. @@ -1995,6 +2175,80 @@ int textureAlign pExecAffinity: *mut cuda_types::cuda::CUexecAffinityParam, type_: cuda_types::cuda::CUexecAffinityType, ) -> cuda_types::cuda::CUresult; + /** \brief Records an event. + + Captures in \p hEvent all the activities of the context \p hCtx + at the time of this call. \p hEvent and \p hCtx must be from the same + CUDA context, otherwise ::CUDA_ERROR_INVALID_HANDLE will be returned. + Calls such as ::cuEventQuery() or ::cuCtxWaitEvent() will then examine + or wait for completion of the work that was captured. + Uses of \p hCtx after this call do not modify \p hEvent. + If the context passed to \p hCtx is the primary context, \p hEvent will + capture all the activities of the primary context and its green contexts. + If the context passed to \p hCtx is a context converted from green context + via ::cuCtxFromGreenCtx(), \p hEvent will capture only the activities of the green context. + + \note The API will return ::CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED if the + specified context \p hCtx has a stream in the capture mode. In such a case, + the call will invalidate all the conflicting captures. + + \param hCtx - Context to record event for + \param hEvent - Event to record + + \return + ::CUDA_SUCCESS + ::CUDA_ERROR_DEINITIALIZED, + ::CUDA_ERROR_NOT_INITIALIZED, + ::CUDA_ERROR_INVALID_CONTEXT, + ::CUDA_ERROR_INVALID_HANDLE, + ::CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED + + \sa + ::cuCtxWaitEvent, + ::cuGreenCtxRecordEvent, + ::cuGreenCtxWaitEvent, + ::cuEventRecord*/ + fn cuCtxRecordEvent( + hCtx: cuda_types::cuda::CUcontext, + hEvent: cuda_types::cuda::CUevent, + ) -> cuda_types::cuda::CUresult; + /** \brief Make a context wait on an event + + Makes all future work submitted to context \p hCtx wait for all work + captured in \p hEvent. The synchronization will be performed on the device + and will not block the calling CPU thread. See ::cuCtxRecordEvent() + for details on what is captured by an event. + If the context passed to \p hCtx is the primary context, the primary context + and its green contexts will wait for \p hEvent. + If the context passed to \p hCtx is a context converted from green context + via ::cuCtxFromGreenCtx(), the green context will wait for \p hEvent. + + \note \p hEvent may be from a different context or device than \p hCtx. + + \note The API will return ::CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED and + invalidate the capture if the specified event \p hEvent is part of an ongoing + capture sequence or if the specified context \p hCtx has a stream in the capture mode. 
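+
+ E.g. a minimal sketch of ordering one context behind another with these two
+ entry points (a hypothetical Rust-side caller; the contexts and the event,
+ created in producer_ctx, are assumed to exist, error handling elided):
+
+ // Capture all work submitted to producer_ctx so far...
+ cuCtxRecordEvent(producer_ctx, event);
+ // ...and make all future work in consumer_ctx wait for it on the device.
+ cuCtxWaitEvent(consumer_ctx, event);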
+ + \param hCtx - Context to wait + \param hEvent - Event to wait on + + \return + ::CUDA_SUCCESS, + ::CUDA_ERROR_DEINITIALIZED, + ::CUDA_ERROR_NOT_INITIALIZED, + ::CUDA_ERROR_INVALID_CONTEXT, + ::CUDA_ERROR_INVALID_HANDLE, + ::CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED + + \sa + ::cuCtxRecordEvent, + ::cuGreenCtxRecordEvent, + ::cuGreenCtxWaitEvent, + ::cuStreamWaitEvent*/ + fn cuCtxWaitEvent( + hCtx: cuda_types::cuda::CUcontext, + hEvent: cuda_types::cuda::CUevent, + ) -> cuda_types::cuda::CUresult; /** \brief Increment a context's usage-count \deprecated @@ -2494,6 +2748,11 @@ int textureAlign ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, and ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES will accumulate data until the CUlinkState is destroyed. + The data passed in via ::cuLinkAddData and ::cuLinkAddFile will be treated + as relocatable (-rdc=true to nvcc) when linking the final cubin during + ::cuLinkComplete and will have similar consequences as offline relocatable + device code linking. + \p optionValues must remain valid for the life of the CUlinkState if output options are used. No other references to inputs are maintained after this call returns. @@ -2739,6 +2998,7 @@ int textureAlign The \p code may be a \e cubin or \e fatbin as output by \b nvcc, or a NULL-terminated \e PTX, either as output by \b nvcc or hand-written. + A fatbin should also contain relocatable code when doing separate compilation. Options are passed as an array via \p jitOptions and any corresponding parameters are passed in \p jitOptionsValues. The number of total JIT options is supplied via \p numJitOptions. @@ -2747,6 +3007,9 @@ int textureAlign Library load options are passed as an array via \p libraryOptions and any corresponding parameters are passed in \p libraryOptionValues. The number of total library load options is supplied via \p numLibraryOptions. + \note If the library contains managed variables and no device in the system + supports managed variables this call is expected to return ::CUDA_ERROR_NOT_SUPPORTED + \param library - Returned library \param code - Code to load \param jitOptions - Options for JIT @@ -2767,7 +3030,8 @@ int textureAlign ::CUDA_ERROR_NO_BINARY_FOR_GPU, ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, - ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND + ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND, + ::CUDA_ERROR_NOT_SUPPORTED \sa ::cuLibraryLoadFromFile, ::cuLibraryUnload, @@ -2800,6 +3064,7 @@ int textureAlign The file should be a \e cubin file as output by \b nvcc, or a \e PTX file either as output by \b nvcc or handwritten, or a \e fatbin file as output by \b nvcc. + A fatbin should also contain relocatable code when doing separate compilation. Options are passed as an array via \p jitOptions and any corresponding parameters are passed in \p jitOptionsValues. The number of total options is supplied via \p numJitOptions. @@ -2808,6 +3073,9 @@ int textureAlign Library load options are passed as an array via \p libraryOptions and any corresponding parameters are passed in \p libraryOptionValues. The number of total library load options is supplied via \p numLibraryOptions. 
+ \note If the library contains managed variables and no device in the system
+ supports managed variables, this call is expected to return ::CUDA_ERROR_NOT_SUPPORTED
+
 \param library - Returned library
 \param fileName - File to load from
 \param jitOptions - Options for JIT
@@ -2828,7 +3096,8 @@ int textureAlign
 ::CUDA_ERROR_NO_BINARY_FOR_GPU,
 ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
 ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED,
- ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND
+ ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND,
+ ::CUDA_ERROR_NOT_SUPPORTED

 \sa ::cuLibraryLoadData,
 ::cuLibraryUnload,
@@ -2980,6 +3249,29 @@ int textureAlign
 pFunc: *mut cuda_types::cuda::CUfunction,
 kernel: cuda_types::cuda::CUkernel,
 ) -> cuda_types::cuda::CUresult;
+ /** \brief Returns a library handle
+
+ Returns in \p pLib the handle of the library for the requested kernel \p kernel
+
+ \param pLib - Returned library handle
+ \param kernel - Kernel to retrieve library handle for
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_NOT_FOUND
+
+ \sa ::cuLibraryLoadData,
+ ::cuLibraryLoadFromFile,
+ ::cuLibraryUnload,
+ ::cuLibraryGetKernel*/
+ fn cuKernelGetLibrary(
+     pLib: *mut cuda_types::cuda::CUlibrary,
+     kernel: cuda_types::cuda::CUkernel,
+ ) -> cuda_types::cuda::CUresult;
 /** \brief Returns a global device pointer

 Returns in \p *dptr and \p *bytes the base pointer and size of the global with
@@ -3023,9 +3315,6 @@ int textureAlign
 Note that managed memory for library \p library is shared across devices and is
 registered when the library is loaded into at least one context.

- \note The API requires a CUDA context to be present and initialized on at least one device.
- If no context is present, the call returns ::CUDA_ERROR_NOT_FOUND.
-
 \param dptr - Returned pointer to the managed memory
 \param bytes - Returned memory size in bytes
 \param library - Library to retrieve managed memory from
@@ -3207,6 +3496,9 @@ int textureAlign
 positive. The validity of the cluster dimensions is checked at launch time. If
 the value is set during compile time, it cannot be set at runtime. Setting it
 at runtime will return CUDA_ERROR_NOT_PERMITTED.
+ - ::CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED: Indicates whether
+ the function can be launched with non-portable cluster size. 1 is allowed,
+ 0 is disallowed.
 - ::CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE: The block
 scheduling policy of a function. The value type is CUclusterSchedulingPolicy.
@@ -3501,9 +3793,10 @@ T* pElement = (T*)((char*)BaseAddress + Row * Pitch) + Column;
 ::cuMemAllocPitch(), ::cuMemAllocManaged(), ::cuMemAllocAsync(), ::cuMemAllocFromPoolAsync()
 Note - This API will not perform any implicit synchronization when the pointer was allocated with
- ::cuMemAllocAsync or ::cuMemAllocFromPoolAsync. Callers must ensure that all accesses to the
+ ::cuMemAllocAsync or ::cuMemAllocFromPoolAsync. Callers must ensure that all accesses to this
 pointer have completed before invoking ::cuMemFree. For best performance and memory reuse, users
 should use ::cuMemFreeAsync to free memory allocated via the stream ordered memory allocator.
+ For all other pointers, this API may perform implicit synchronization.

 \param dptr - Pointer to memory to free
@@ -4056,7 +4349,8 @@ T* pElement = (T*)((char*)BaseAddress + Row * Pitch) + Column;
 IPC functionality is restricted to devices with support for unified
 addressing on Linux and Windows operating systems.
- IPC functionality on Windows is restricted to GPUs in TCC mode + IPC functionality on Windows is supported for compatibility purposes + but not recommended as it comes with performance cost. Users can test their device for IPC functionality by calling ::cuapiDeviceGetAttribute with ::CU_DEVICE_ATTRIBUTE_IPC_EVENT_SUPPORTED @@ -4099,7 +4393,8 @@ T* pElement = (T*)((char*)BaseAddress + Row * Pitch) + Column; IPC functionality is restricted to devices with support for unified addressing on Linux and Windows operating systems. - IPC functionality on Windows is restricted to GPUs in TCC mode + IPC functionality on Windows is supported for compatibility purposes + but not recommended as it comes with performance cost. Users can test their device for IPC functionality by calling ::cuapiDeviceGetAttribute with ::CU_DEVICE_ATTRIBUTE_IPC_EVENT_SUPPORTED @@ -4144,7 +4439,8 @@ T* pElement = (T*)((char*)BaseAddress + Row * Pitch) + Column; IPC functionality is restricted to devices with support for unified addressing on Linux and Windows operating systems. - IPC functionality on Windows is restricted to GPUs in TCC mode + IPC functionality on Windows is supported for compatibility purposes + but not recommended as it comes with performance cost. Users can test their device for IPC functionality by calling ::cuapiDeviceGetAttribute with ::CU_DEVICE_ATTRIBUTE_IPC_EVENT_SUPPORTED @@ -4199,7 +4495,8 @@ T* pElement = (T*)((char*)BaseAddress + Row * Pitch) + Column; IPC functionality is restricted to devices with support for unified addressing on Linux and Windows operating systems. - IPC functionality on Windows is restricted to GPUs in TCC mode + IPC functionality on Windows is supported for compatibility purposes + but not recommended as it comes with performance cost. Users can test their device for IPC functionality by calling ::cuapiDeviceGetAttribute with ::CU_DEVICE_ATTRIBUTE_IPC_EVENT_SUPPORTED @@ -4245,7 +4542,8 @@ T* pElement = (T*)((char*)BaseAddress + Row * Pitch) + Column; IPC functionality is restricted to devices with support for unified addressing on Linux and Windows operating systems. - IPC functionality on Windows is restricted to GPUs in TCC mode + IPC functionality on Windows is supported for compatibility purposes + but not recommended as it comes with performance cost. Users can test their device for IPC functionality by calling ::cuapiDeviceGetAttribute with ::CU_DEVICE_ATTRIBUTE_IPC_EVENT_SUPPORTED @@ -5954,6 +6252,160 @@ CUdeviceptr dstStart = dstDevice+(dstZ*dstHeight+dstY)*dstPitch+dstXInBytes; pCopy: *const cuda_types::cuda::CUDA_MEMCPY3D_PEER, hStream: cuda_types::cuda::CUstream, ) -> cuda_types::cuda::CUresult; + /** \brief Performs a batch of memory copies asynchronously. + + Performs a batch of memory copies. The batch as a whole executes in stream order but copies within a + batch are not guaranteed to execute in any specific order. This API only supports pointer-to-pointer copies. + For copies involving CUDA arrays, please see ::cuMemcpy3DBatchAsync. + + Performs memory copies from source buffers specified in \p srcs to destination buffers specified in \p dsts. + The size of each copy is specified in \p sizes. All three arrays must be of the same length as specified + by \p count. Since there are no ordering guarantees for copies within a batch, specifying any dependent copies + within a batch will result in undefined behavior. + + Every copy in the batch has to be associated with a set of attributes specified in the \p attrs array. 
+ Each entry in this array can apply to more than one copy. This can be done by specifying, in the \p attrsIdxs array,
+ the index of the first copy that the corresponding entry in the \p attrs array applies to. Both \p attrs and
+ \p attrsIdxs must be of the same length as specified by \p numAttrs. For example, if a batch has 10 copies listed
+ in dst/src/sizes, the first 6 of which have one set of attributes and the remaining 4 another, then \p numAttrs
+ will be 2, \p attrsIdxs will be {0, 6} and \p attrs will contain the two sets of attributes. Note that the first entry
+ in \p attrsIdxs must always be 0. Also, each entry must be greater than the previous entry and the last entry should be
+ less than \p count. Furthermore, \p numAttrs must be less than or equal to \p count.
+
+ The ::CUmemcpyAttributes::srcAccessOrder indicates the source access ordering to be observed for copies associated
+ with the attribute. If the source access order is set to ::CU_MEMCPY_SRC_ACCESS_ORDER_STREAM, then the source will
+ be accessed in stream order. If the source access order is set to ::CU_MEMCPY_SRC_ACCESS_ORDER_DURING_API_CALL then
+ it indicates that access to the source pointer can be out of stream order and all accesses must be complete before
+ the API call returns. This flag is suited for ephemeral sources (ex., stack variables) when it's known that no prior
+ operations in the stream can be accessing the memory and also that the lifetime of the memory is limited to the scope
+ that the source variable was declared in. Specifying this flag allows the driver to optimize the copy and removes the
+ need for the user to synchronize the stream after the API call. If the source access order is set to
+ ::CU_MEMCPY_SRC_ACCESS_ORDER_ANY then it indicates that access to the source pointer can be out of stream order and the
+ accesses can happen even after the API call returns. This flag is suited for host pointers allocated
+ outside CUDA (ex., via malloc) when it's known that no prior operations in the stream can be accessing the memory.
+ Specifying this flag allows the driver to optimize the copy on certain platforms. Each memcpy operation in the batch must
+ have a valid ::CUmemcpyAttributes corresponding to it including the appropriate srcAccessOrder setting, otherwise the API
+ will return ::CUDA_ERROR_INVALID_VALUE.
+
+ The ::CUmemcpyAttributes::srcLocHint and ::CUmemcpyAttributes::dstLocHint allow applications to specify hint locations
+ for operands of a copy when the operand doesn't have a fixed location. That is, these hints are
+ only applicable for managed memory pointers on devices where ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS is true or
+ system-allocated pageable memory on devices where ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS is true.
+ For other cases, these hints are ignored.
+
+ The ::CUmemcpyAttributes::flags field can be used to specify certain flags for copies. Setting the
+ ::CU_MEMCPY_FLAG_PREFER_OVERLAP_WITH_COMPUTE flag indicates that the associated copies should preferably overlap with
+ any compute work. Note that this flag is a hint and can be ignored depending on the platform and other parameters of the copy.
+
+ If any error is encountered while parsing the batch, the index within the batch where the error was encountered
+ will be returned in \p failIdx.
+
+ \param dsts - Array of destination pointers.
+ \param srcs - Array of memcpy source pointers.
+ \param sizes - Array of sizes for memcpy operations.
+ \param count - Size of \p dsts, \p srcs and \p sizes arrays + \param attrs - Array of memcpy attributes. + \param attrsIdxs - Array of indices to specify which copies each entry in the \p attrs array applies to. +The attributes specified in attrs[k] will be applied to copies starting from attrsIdxs[k] +through attrsIdxs[k+1] - 1. Also attrs[numAttrs-1] will apply to copies starting from +attrsIdxs[numAttrs-1] through count - 1. + \param numAttrs - Size of \p attrs and \p attrsIdxs arrays. + \param failIdx - Pointer to a location to return the index of the copy where a failure was encountered. +The value will be SIZE_MAX if the error doesn't pertain to any specific copy. + \param hStream - The stream to enqueue the operations in. Must not be legacy NULL stream. + + \return + ::CUDA_SUCCESS + ::CUDA_ERROR_DEINITIALIZED + ::CUDA_ERROR_NOT_INITIALIZED + ::CUDA_ERROR_INVALID_VALUE + \notefnerr + \note_async + \note_memcpy*/ + fn cuMemcpyBatchAsync_ptsz( + dsts: *mut cuda_types::cuda::CUdeviceptr, + srcs: *mut cuda_types::cuda::CUdeviceptr, + sizes: *mut usize, + count: usize, + attrs: *mut cuda_types::cuda::CUmemcpyAttributes, + attrsIdxs: *mut usize, + numAttrs: usize, + failIdx: *mut usize, + hStream: cuda_types::cuda::CUstream, + ) -> cuda_types::cuda::CUresult; + /** \brief Performs a batch of 3D memory copies asynchronously. + + Performs a batch of memory copies. The batch as a whole executes in stream order but copies within a + batch are not guaranteed to execute in any specific order. Note that this means specifying any dependent + copies within a batch will result in undefined behavior. + + Performs memory copies as specified in the \p opList array. The length of this array is specified in \p numOps. + Each entry in this array describes a copy operation. This includes among other things, the source and destination + operands for the copy as specified in ::CUDA_MEMCPY3D_BATCH_OP::src and ::CUDA_MEMCPY3D_BATCH_OP::dst respectively. + The source and destination operands of a copy can either be a pointer or a CUDA array. The width, height and depth + of a copy is specified in ::CUDA_MEMCPY3D_BATCH_OP::extent. The width, height and depth of a copy are specified in + elements and must not be zero. For pointer-to-pointer copies, the element size is considered to be 1. For pointer + to CUDA array or vice versa copies, the element size is determined by the CUDA array. For CUDA array to CUDA array copies, + the element size of the two CUDA arrays must match. + + For a given operand, if ::CUmemcpy3DOperand::type is specified as ::CU_MEMCPY_OPERAND_TYPE_POINTER, then + ::CUmemcpy3DOperand::op::ptr will be used. The ::CUmemcpy3DOperand::op::ptr::ptr field must contain the pointer where + the copy should begin. The ::CUmemcpy3DOperand::op::ptr::rowLength field specifies the length of each row in elements and + must either be zero or be greater than or equal to the width of the copy specified in ::CUDA_MEMCPY3D_BATCH_OP::extent::width. + The ::CUmemcpy3DOperand::op::ptr::layerHeight field specifies the height of each layer and must either be zero or be greater than + or equal to the height of the copy specified in ::CUDA_MEMCPY3D_BATCH_OP::extent::height. When either of these values is zero, + that aspect of the operand is considered to be tightly packed according to the copy extent. 
For managed memory pointers on devices where + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS is true or system-allocated pageable memory on devices where + ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS is true, the ::CUmemcpy3DOperand::op::ptr::locHint field can be used to hint + the location of the operand. + + If an operand's type is specified as ::CU_MEMCPY_OPERAND_TYPE_ARRAY, then ::CUmemcpy3DOperand::op::array will be used. + The ::CUmemcpy3DOperand::op::array::array field specifies the CUDA array and ::CUmemcpy3DOperand::op::array::offset specifies + the 3D offset into that array where the copy begins. + + The ::CUmemcpyAttributes::srcAccessOrder indicates the source access ordering to be observed for copies associated + with the attribute. If the source access order is set to ::CU_MEMCPY_SRC_ACCESS_ORDER_STREAM, then the source will + be accessed in stream order. If the source access order is set to ::CU_MEMCPY_SRC_ACCESS_ORDER_DURING_API_CALL then + it indicates that access to the source pointer can be out of stream order and all accesses must be complete before + the API call returns. This flag is suited for ephemeral sources (ex., stack variables) when it's known that no prior + operations in the stream can be accessing the memory and also that the lifetime of the memory is limited to the scope + that the source variable was declared in. Specifying this flag allows the driver to optimize the copy and removes the + need for the user to synchronize the stream after the API call. If the source access order is set to + ::CU_MEMCPY_SRC_ACCESS_ORDER_ANY then it indicates that access to the source pointer can be out of stream order and the + accesses can happen even after the API call returns. This flag is suited for host pointers allocated + outside CUDA (ex., via malloc) when it's known that no prior operations in the stream can be accessing the memory. + Specifying this flag allows the driver to optimize the copy on certain platforms. Each memcopy operation in \p opList must + have a valid srcAccessOrder setting, otherwise this API will return ::CUDA_ERROR_INVALID_VALUE. + + The ::CUmemcpyAttributes::flags field can be used to specify certain flags for copies. Setting the + ::CU_MEMCPY_FLAG_PREFER_OVERLAP_WITH_COMPUTE flag indicates that the associated copies should preferably overlap with + any compute work. Note that this flag is a hint and can be ignored depending on the platform and other parameters of the copy. + + If any error is encountered while parsing the batch, the index within the batch where the error was encountered + will be returned in \p failIdx. + + \param numOps - Total number of memcpy operations. + \param opList - Array of size \p numOps containing the actual memcpy operations. + \param failIdx - Pointer to a location to return the index of the copy where a failure was encountered. + The value will be SIZE_MAX if the error doesn't pertain to any specific copy. + \param flags - Flags for future use, must be zero now. + \param hStream - The stream to enqueue the operations in. Must not be default NULL stream. 
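+
+ E.g. a minimal sketch of submitting a prepared batch (a hypothetical
+ Rust-side caller; ops is a Vec<CUDA_MEMCPY3D_BATCH_OP> filled in as
+ described above, error handling elided):
+
+ let mut fail_idx: usize = usize::MAX;
+ // flags must currently be zero; fail_idx reports the offending op on error.
+ cuMemcpy3DBatchAsync_ptsz(ops.len(), ops.as_mut_ptr(), &mut fail_idx, 0, stream);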
+ + \return + ::CUDA_SUCCESS + ::CUDA_ERROR_DEINITIALIZED + ::CUDA_ERROR_NOT_INITIALIZED + ::CUDA_ERROR_INVALID_VALUE + \notefnerr + \note_async + \note_memcpy*/ + fn cuMemcpy3DBatchAsync_ptsz( + numOps: usize, + opList: *mut cuda_types::cuda::CUDA_MEMCPY3D_BATCH_OP, + failIdx: *mut usize, + flags: ::core::ffi::c_ulonglong, + hStream: cuda_types::cuda::CUstream, + ) -> cuda_types::cuda::CUresult; /** \brief Initializes device memory Sets the memory range of \p N 8-bit values to the specified value @@ -6479,7 +6931,50 @@ CU_AD_FORMAT_SIGNED_INT8 = 0x08, CU_AD_FORMAT_SIGNED_INT16 = 0x09, CU_AD_FORMAT_SIGNED_INT32 = 0x0a, CU_AD_FORMAT_HALF = 0x10, -CU_AD_FORMAT_FLOAT = 0x20 +CU_AD_FORMAT_FLOAT = 0x20, +CU_AD_FORMAT_NV12 = 0xb0, +CU_AD_FORMAT_UNORM_INT8X1 = 0xc0, +CU_AD_FORMAT_UNORM_INT8X2 = 0xc1, +CU_AD_FORMAT_UNORM_INT8X4 = 0xc2, +CU_AD_FORMAT_UNORM_INT16X1 = 0xc3, +CU_AD_FORMAT_UNORM_INT16X2 = 0xc4, +CU_AD_FORMAT_UNORM_INT16X4 = 0xc5, +CU_AD_FORMAT_SNORM_INT8X1 = 0xc6, +CU_AD_FORMAT_SNORM_INT8X2 = 0xc7, +CU_AD_FORMAT_SNORM_INT8X4 = 0xc8, +CU_AD_FORMAT_SNORM_INT16X1 = 0xc9, +CU_AD_FORMAT_SNORM_INT16X2 = 0xca, +CU_AD_FORMAT_SNORM_INT16X4 = 0xcb, +CU_AD_FORMAT_BC1_UNORM = 0x91, +CU_AD_FORMAT_BC1_UNORM_SRGB = 0x92, +CU_AD_FORMAT_BC2_UNORM = 0x93, +CU_AD_FORMAT_BC2_UNORM_SRGB = 0x94, +CU_AD_FORMAT_BC3_UNORM = 0x95, +CU_AD_FORMAT_BC3_UNORM_SRGB = 0x96, +CU_AD_FORMAT_BC4_UNORM = 0x97, +CU_AD_FORMAT_BC4_SNORM = 0x98, +CU_AD_FORMAT_BC5_UNORM = 0x99, +CU_AD_FORMAT_BC5_SNORM = 0x9a, +CU_AD_FORMAT_BC6H_UF16 = 0x9b, +CU_AD_FORMAT_BC6H_SF16 = 0x9c, +CU_AD_FORMAT_BC7_UNORM = 0x9d, +CU_AD_FORMAT_BC7_UNORM_SRGB = 0x9e, +CU_AD_FORMAT_P010 = 0x9f, +CU_AD_FORMAT_P016 = 0xa1, +CU_AD_FORMAT_NV16 = 0xa2, +CU_AD_FORMAT_P210 = 0xa3, +CU_AD_FORMAT_P216 = 0xa4, +CU_AD_FORMAT_YUY2 = 0xa5, +CU_AD_FORMAT_Y210 = 0xa6, +CU_AD_FORMAT_Y216 = 0xa7, +CU_AD_FORMAT_AYUV = 0xa8, +CU_AD_FORMAT_Y410 = 0xa9, +CU_AD_FORMAT_Y416 = 0xb1, +CU_AD_FORMAT_Y444_PLANAR8 = 0xb2, +CU_AD_FORMAT_Y444_PLANAR10 = 0xb3, +CU_AD_FORMAT_YUV444_8bit_SemiPlanar = 0xb4, +CU_AD_FORMAT_YUV444_16bit_SemiPlanar = 0xb5, +CU_AD_FORMAT_UNORM_INT_101010_2 = 0x50, } CUarray_format; \endcode - \p NumChannels specifies the number of packed components per CUDA array @@ -6799,7 +7294,50 @@ CU_AD_FORMAT_SIGNED_INT8 = 0x08, CU_AD_FORMAT_SIGNED_INT16 = 0x09, CU_AD_FORMAT_SIGNED_INT32 = 0x0a, CU_AD_FORMAT_HALF = 0x10, -CU_AD_FORMAT_FLOAT = 0x20 +CU_AD_FORMAT_FLOAT = 0x20, +CU_AD_FORMAT_NV12 = 0xb0, +CU_AD_FORMAT_UNORM_INT8X1 = 0xc0, +CU_AD_FORMAT_UNORM_INT8X2 = 0xc1, +CU_AD_FORMAT_UNORM_INT8X4 = 0xc2, +CU_AD_FORMAT_UNORM_INT16X1 = 0xc3, +CU_AD_FORMAT_UNORM_INT16X2 = 0xc4, +CU_AD_FORMAT_UNORM_INT16X4 = 0xc5, +CU_AD_FORMAT_SNORM_INT8X1 = 0xc6, +CU_AD_FORMAT_SNORM_INT8X2 = 0xc7, +CU_AD_FORMAT_SNORM_INT8X4 = 0xc8, +CU_AD_FORMAT_SNORM_INT16X1 = 0xc9, +CU_AD_FORMAT_SNORM_INT16X2 = 0xca, +CU_AD_FORMAT_SNORM_INT16X4 = 0xcb, +CU_AD_FORMAT_BC1_UNORM = 0x91, +CU_AD_FORMAT_BC1_UNORM_SRGB = 0x92, +CU_AD_FORMAT_BC2_UNORM = 0x93, +CU_AD_FORMAT_BC2_UNORM_SRGB = 0x94, +CU_AD_FORMAT_BC3_UNORM = 0x95, +CU_AD_FORMAT_BC3_UNORM_SRGB = 0x96, +CU_AD_FORMAT_BC4_UNORM = 0x97, +CU_AD_FORMAT_BC4_SNORM = 0x98, +CU_AD_FORMAT_BC5_UNORM = 0x99, +CU_AD_FORMAT_BC5_SNORM = 0x9a, +CU_AD_FORMAT_BC6H_UF16 = 0x9b, +CU_AD_FORMAT_BC6H_SF16 = 0x9c, +CU_AD_FORMAT_BC7_UNORM = 0x9d, +CU_AD_FORMAT_BC7_UNORM_SRGB = 0x9e, +CU_AD_FORMAT_P010 = 0x9f, +CU_AD_FORMAT_P016 = 0xa1, +CU_AD_FORMAT_NV16 = 0xa2, +CU_AD_FORMAT_P210 = 0xa3, +CU_AD_FORMAT_P216 = 0xa4, +CU_AD_FORMAT_YUY2 = 0xa5, +CU_AD_FORMAT_Y210 = 0xa6, 
+CU_AD_FORMAT_Y216 = 0xa7, +CU_AD_FORMAT_AYUV = 0xa8, +CU_AD_FORMAT_Y410 = 0xa9, +CU_AD_FORMAT_Y416 = 0xb1, +CU_AD_FORMAT_Y444_PLANAR8 = 0xb2, +CU_AD_FORMAT_Y444_PLANAR10 = 0xb3, +CU_AD_FORMAT_YUV444_8bit_SemiPlanar = 0xb4, +CU_AD_FORMAT_YUV444_16bit_SemiPlanar = 0xb5, +CU_AD_FORMAT_UNORM_INT_101010_2 = 0x50, } CUarray_format; \endcode @@ -7020,7 +7558,50 @@ CU_AD_FORMAT_SIGNED_INT8 = 0x08, CU_AD_FORMAT_SIGNED_INT16 = 0x09, CU_AD_FORMAT_SIGNED_INT32 = 0x0a, CU_AD_FORMAT_HALF = 0x10, -CU_AD_FORMAT_FLOAT = 0x20 +CU_AD_FORMAT_FLOAT = 0x20, +CU_AD_FORMAT_NV12 = 0xb0, +CU_AD_FORMAT_UNORM_INT8X1 = 0xc0, +CU_AD_FORMAT_UNORM_INT8X2 = 0xc1, +CU_AD_FORMAT_UNORM_INT8X4 = 0xc2, +CU_AD_FORMAT_UNORM_INT16X1 = 0xc3, +CU_AD_FORMAT_UNORM_INT16X2 = 0xc4, +CU_AD_FORMAT_UNORM_INT16X4 = 0xc5, +CU_AD_FORMAT_SNORM_INT8X1 = 0xc6, +CU_AD_FORMAT_SNORM_INT8X2 = 0xc7, +CU_AD_FORMAT_SNORM_INT8X4 = 0xc8, +CU_AD_FORMAT_SNORM_INT16X1 = 0xc9, +CU_AD_FORMAT_SNORM_INT16X2 = 0xca, +CU_AD_FORMAT_SNORM_INT16X4 = 0xcb, +CU_AD_FORMAT_BC1_UNORM = 0x91, +CU_AD_FORMAT_BC1_UNORM_SRGB = 0x92, +CU_AD_FORMAT_BC2_UNORM = 0x93, +CU_AD_FORMAT_BC2_UNORM_SRGB = 0x94, +CU_AD_FORMAT_BC3_UNORM = 0x95, +CU_AD_FORMAT_BC3_UNORM_SRGB = 0x96, +CU_AD_FORMAT_BC4_UNORM = 0x97, +CU_AD_FORMAT_BC4_SNORM = 0x98, +CU_AD_FORMAT_BC5_UNORM = 0x99, +CU_AD_FORMAT_BC5_SNORM = 0x9a, +CU_AD_FORMAT_BC6H_UF16 = 0x9b, +CU_AD_FORMAT_BC6H_SF16 = 0x9c, +CU_AD_FORMAT_BC7_UNORM = 0x9d, +CU_AD_FORMAT_BC7_UNORM_SRGB = 0x9e, +CU_AD_FORMAT_P010 = 0x9f, +CU_AD_FORMAT_P016 = 0xa1, +CU_AD_FORMAT_NV16 = 0xa2, +CU_AD_FORMAT_P210 = 0xa3, +CU_AD_FORMAT_P216 = 0xa4, +CU_AD_FORMAT_YUY2 = 0xa5, +CU_AD_FORMAT_Y210 = 0xa6, +CU_AD_FORMAT_Y216 = 0xa7, +CU_AD_FORMAT_AYUV = 0xa8, +CU_AD_FORMAT_Y410 = 0xa9, +CU_AD_FORMAT_Y416 = 0xb1, +CU_AD_FORMAT_Y444_PLANAR8 = 0xb2, +CU_AD_FORMAT_Y444_PLANAR10 = 0xb3, +CU_AD_FORMAT_YUV444_8bit_SemiPlanar = 0xb4, +CU_AD_FORMAT_YUV444_16bit_SemiPlanar = 0xb5, +CU_AD_FORMAT_UNORM_INT_101010_2 = 0x50, } CUarray_format; \endcode @@ -7184,11 +7765,17 @@ CU_AD_FORMAT_FLOAT = 0x20 new handle every time the underlying physical allocation(s) corresponding to a previously queried VA range are changed. + For CUmemRangeHandleType::CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, users may set + flags to ::CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE. Which when set on a + supported platform, will give a DMA_BUF handle mapped via PCIE BAR1 or will + return an error otherwise. + \param[out] handle - Pointer to the location where the returned handle will be stored. \param[in] dptr - Pointer to a valid CUDA device allocation. Must be aligned to host page size. \param[in] size - Length of the address range. Must be aligned to host page size. \param[in] handleType - Type of handle requested (defines type and size of the \p handle output parameter) - \param[in] flags - Reserved, must be zero + \param[in] flags - When requesting CUmemRangeHandleType::CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD the value could be + ::CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE, otherwise 0. \return CUDA_SUCCESS @@ -7201,6 +7788,69 @@ CU_AD_FORMAT_FLOAT = 0x20 handleType: cuda_types::cuda::CUmemRangeHandleType, flags: ::core::ffi::c_ulonglong, ) -> cuda_types::cuda::CUresult; + /** \brief Submit a batch of \p count independent decompression operations. + + \details Each of the \p count decompression operations is described by a + single entry in the \p paramsArray array. Once the batch has been + submitted, the function will return, and decompression will happen + asynchronously w.r.t. the CPU. 
To the work completion tracking
+ mechanisms in the CUDA driver, the batch will be considered a single
+ unit of work and processed according to stream semantics, i.e., it
+ is not possible to query the completion of individual decompression
+ operations within a batch.
+
+ The memory pointed to by each of ::CUmemDecompressParams.src,
+ ::CUmemDecompressParams.dst, and ::CUmemDecompressParams.dstActBytes,
+ must be capable of usage with the hardware decompress feature. That
+ is, for each of said pointers, the pointer attribute
+ ::CU_POINTER_ATTRIBUTE_IS_MEM_DECOMPRESS_CAPABLE should give a
+ non-zero value. To ensure this, the memory backing the pointers
+ should have been allocated using one of the following CUDA memory
+ allocators:
+ * ::cuMemAlloc()
+ * ::cuMemCreate() with the usage flag ::CU_MEM_CREATE_USAGE_HW_DECOMPRESS
+ * ::cuMemAllocFromPoolAsync() from a pool that was created with
+ the usage flag ::CU_MEM_POOL_CREATE_USAGE_HW_DECOMPRESS
+ Additionally, ::CUmemDecompressParams.src, ::CUmemDecompressParams.dst,
+ and ::CUmemDecompressParams.dstActBytes, must all be accessible from
+ the device associated with the context where \p stream was created.
+ For information on how to ensure this, see the documentation for the
+ allocator of interest.
+
+ \param[in] paramsArray The array of structures describing the independent
+ decompression operations.
+ \param[in] count The number of entries in the \p paramsArray array.
+ \param[in] flags Must be 0.
+ \param[out] errorIndex The index into \p paramsArray of the decompression
+ operation to which the error returned by this
+ function pertains. If the returned index is SIZE_MAX and
+ the value returned is not ::CUDA_SUCCESS, then the
+ error returned by this function should be considered
+ a general error that does not pertain to a
+ particular decompression operation. May be \p NULL,
+ in which case, no index will be recorded in the
+ event of an error.
+ \param[in] stream The stream where the work will be enqueued.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_HANDLE
+ \notefnerr
+ \note_async
+ \note_null_stream
+
+ \sa ::cuMemAlloc, ::cuMemPoolCreate, ::cuMemAllocFromPoolAsync*/
+ fn cuMemBatchDecompressAsync_ptsz(
+     paramsArray: *mut cuda_types::cuda::CUmemDecompressParams,
+     count: usize,
+     flags: ::core::ffi::c_uint,
+     errorIndex: *mut usize,
+     stream: cuda_types::cuda::CUstream,
+ ) -> cuda_types::cuda::CUresult;
 /** \brief Allocate an address range reservation.

 Reserves a virtual address range based on the given parameters, giving
@@ -7270,17 +7920,23 @@ CU_AD_FORMAT_FLOAT = 0x20
 set ::CUmemAllocationProp::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
 ::CUmemAllocationProp::CUmemLocation::id must specify the NUMA ID of the CPU.
 On systems where NUMA is not available ::CUmemAllocationProp::CUmemLocation::id must be set to 0.
+ Specifying ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT or ::CU_MEM_LOCATION_TYPE_HOST as the
+ ::CUmemLocation::type will result in ::CUDA_ERROR_INVALID_VALUE.

- Applications can set ::CUmemAllocationProp::requestedHandleTypes to
- ::CU_MEM_HANDLE_TYPE_FABRIC in order to create allocations suitable for sharing
- within an IMEX domain. An IMEX domain is either an OS instance or a group of securely
- connected OS instances using the NVIDIA IMEX daemon.
An IMEX channel is a global resource
- within the IMEX domain that represents a logical entity that aims to provide fine grained
- accessibility control for the participating processes. When exporter and importer CUDA processes
- have been granted access to the same IMEX channel, they can securely share memory.
- If the allocating process does not have access setup for an IMEX channel, attempting to create
- a ::CUmemGenericAllocationHandle with ::CU_MEM_HANDLE_TYPE_FABRIC will result in ::CUDA_ERROR_NOT_PERMITTED.
- The nvidia-modprobe CLI provides more information regarding setting up of IMEX channels.
+ Applications that intend to use ::CU_MEM_HANDLE_TYPE_FABRIC-based memory sharing must ensure:
+ (1) the `nvidia-caps-imex-channels` character device is created by the driver and is listed under /proc/devices, and
+ (2) at least one IMEX channel file is accessible to the user launching the application.
+
+ When exporter and importer CUDA processes have been granted access to the same IMEX channel, they can securely
+ share memory.
+
+ The IMEX channel security model works on a per-user basis, which means all processes under a user can share
+ memory if the user has access to a valid IMEX channel. When multi-user isolation is desired, a separate IMEX
+ channel is required for each user.
+
+ These channel files exist in /dev/nvidia-caps-imex-channels/channel* and can be created using standard OS
+ native calls like mknod on Linux. For example, to create channel0 with the major number from /proc/devices,
+ users can execute the following command: `mknod /dev/nvidia-caps-imex-channels/channel0 c 0`

 If ::CUmemAllocationProp::allocFlags::usage contains the ::CU_MEM_CREATE_USAGE_TILE_POOL flag then the memory
 allocation is intended only to be used as backing tile pool for sparse CUDA arrays
@@ -7955,22 +8611,28 @@ CU_MEM_OPERATION_TYPE_UNMAP = 2
 To create a memory pool targeting a specific host NUMA node, applications must
 set ::CUmemPoolProps::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
 ::CUmemPoolProps::CUmemLocation::id must specify the NUMA ID of the host memory node.
+ Specifying ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT or ::CU_MEM_LOCATION_TYPE_HOST as the
+ ::CUmemPoolProps::CUmemLocation::type will result in ::CUDA_ERROR_INVALID_VALUE.
 By default, the pool's memory will be accessible from the device it is allocated on.
 In the case of pools created with ::CU_MEM_LOCATION_TYPE_HOST_NUMA, their default accessibility
 will be from the host CPU.
 Applications can control the maximum size of the pool by specifying a non-zero value for ::CUmemPoolProps::maxSize.
 If set to 0, the maximum size of the pool will default to a system dependent value.

- Applications can set ::CUmemPoolProps::handleTypes to ::CU_MEM_HANDLE_TYPE_FABRIC
- in order to create ::CUmemoryPool suitable for sharing within an IMEX domain.
- An IMEX domain is either an OS instance or a group of securely connected OS instances
- using the NVIDIA IMEX daemon. An IMEX channel is a global resource within the IMEX domain
- that represents a logical entity that aims to provide fine grained accessibility control
- for the participating processes. When exporter and importer CUDA processes have been
- granted access to the same IMEX channel, they can securely share memory.
- If the allocating process does not have access setup for an IMEX channel, attempting to export
- a ::CUmemoryPool with ::CU_MEM_HANDLE_TYPE_FABRIC will result in ::CUDA_ERROR_NOT_PERMITTED.
- The nvidia-modprobe CLI provides more information regarding setting up of IMEX channels.
+ Applications that intend to use ::CU_MEM_HANDLE_TYPE_FABRIC-based memory sharing must ensure:
+ (1) the `nvidia-caps-imex-channels` character device is created by the driver and is listed under /proc/devices, and
+ (2) at least one IMEX channel file is accessible to the user launching the application.
+
+ When exporter and importer CUDA processes have been granted access to the same IMEX channel, they can securely
+ share memory.
+
+ The IMEX channel security model works on a per-user basis, which means all processes under a user can share
+ memory if the user has access to a valid IMEX channel. When multi-user isolation is desired, a separate IMEX
+ channel is required for each user.
+
+ These channel files exist in /dev/nvidia-caps-imex-channels/channel* and can be created using standard OS
+ native calls like mknod on Linux. For example, to create channel0 with the major number from /proc/devices,
+ users can execute the following command: `mknod /dev/nvidia-caps-imex-channels/channel0 c 0`

 \note Specifying CU_MEM_HANDLE_TYPE_NONE creates a memory pool that will not support IPC.
@@ -8251,8 +8913,8 @@ CU_MEM_OPERATION_TYPE_UNMAP = 2
 returned by ::cuMulticastGetGranularity with the flag
 ::CU_MULTICAST_GRANULARITY_RECOMMENDED.

- The \p size + \p memOffset must be smaller than the size of the allocated
- memory. Similarly the \p size + \p mcOffset must be smaller than the size
+ The \p size + \p memOffset cannot be larger than the size of the allocated
+ memory. Similarly the \p size + \p mcOffset cannot be larger than the size
 of the multicast object.
 The memory allocation must have been created on one of the devices
 that was added to the multicast team via ::cuMulticastAddDevice.
@@ -8303,8 +8965,8 @@ CU_MEM_OPERATION_TYPE_UNMAP = 2
 aligned to the value returned by ::cuMulticastGetGranularity with the flag
 ::CU_MULTICAST_GRANULARITY_RECOMMENDED.

- The \p size must be smaller than the size of the allocated memory.
- Similarly the \p size + \p mcOffset must be smaller than the total size
+ The \p size cannot be larger than the size of the allocated memory.
+ Similarly the \p size + \p mcOffset cannot be larger than the total size
 of the multicast object.
 The memory allocation must have been created on one of the devices
 that was added to the multicast team via ::cuMulticastAddDevice.
@@ -8348,7 +9010,7 @@ CU_MEM_OPERATION_TYPE_UNMAP = 2
 The intended \p size of the unbind and the offset in the multicast range
 (\p mcOffset) must be a multiple of the value returned by
 ::cuMulticastGetGranularity with the flag ::CU_MULTICAST_GRANULARITY_MINIMUM.
- The \p size + \p mcOffset must be smaller than the total size of the
+ The \p size + \p mcOffset cannot be larger than the total size of the
 multicast object.

 \note
@@ -8547,6 +9209,12 @@ CU_MEM_OPERATION_TYPE_UNMAP = 2
 Returns in \p *data the handle to the mempool that the allocation was obtained from.

+ - ::CU_POINTER_ATTRIBUTE_IS_HW_DECOMPRESS_CAPABLE:
+
+ Returns in \p *data a boolean that indicates whether the pointer points
+ to memory that is capable of being used for hardware-accelerated
+ decompression.
+
 \par

 Note that for most allocations in the unified virtual address space
@@ -8602,7 +9270,9 @@ CU_MEM_OPERATION_TYPE_UNMAP = 2
 base device pointer of the memory to be prefetched and \p dstDevice is the
 destination device. \p count specifies the number of bytes to copy. \p hStream
 is the stream in which the operation is enqueued.
The memory range must refer - to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables. + to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables + or it may also refer to system-allocated memory on systems with non-zero + CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. Passing in CU_DEVICE_CPU for \p dstDevice will prefetch the data to host memory. If \p dstDevice is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS @@ -9179,6 +9849,7 @@ CU_MEM_OPERATION_TYPE_UNMAP = 2 - ::CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE - ::CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES - ::CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE + - ::CU_POINTER_ATTRIBUTE_IS_HW_DECOMPRESS_CAPABLE \param numAttributes - Number of attributes to query \param attributes - An array of attributes to query @@ -9237,8 +9908,10 @@ CU_MEM_OPERATION_TYPE_UNMAP = 2 \sa ::cuStreamDestroy, ::cuStreamCreateWithPriority, + ::cuGreenCtxStreamCreate, ::cuStreamGetPriority, ::cuStreamGetFlags, + ::cuStreamGetDevice ::cuStreamWaitEvent, ::cuStreamQuery, ::cuStreamSynchronize, @@ -9288,9 +9961,11 @@ CU_MEM_OPERATION_TYPE_UNMAP = 2 \sa ::cuStreamDestroy, ::cuStreamCreate, + ::cuGreenCtxStreamCreate, ::cuStreamGetPriority, ::cuCtxGetStreamPriorityRange, ::cuStreamGetFlags, + ::cuStreamGetDevice ::cuStreamWaitEvent, ::cuStreamQuery, ::cuStreamSynchronize, @@ -9303,7 +9978,7 @@ CU_MEM_OPERATION_TYPE_UNMAP = 2 ) -> cuda_types::cuda::CUresult; /** \brief Query the priority of a given stream - Query the priority of a stream created using ::cuStreamCreate or ::cuStreamCreateWithPriority + Query the priority of a stream created using ::cuStreamCreate, ::cuStreamCreateWithPriority or ::cuGreenCtxStreamCreate and return the priority in \p priority. Note that if the stream was created with a priority outside the numerical range returned by ::cuCtxGetStreamPriorityRange, this function returns the clamped priority. @@ -9324,16 +9999,44 @@ CU_MEM_OPERATION_TYPE_UNMAP = 2 \sa ::cuStreamDestroy, ::cuStreamCreate, ::cuStreamCreateWithPriority, + ::cuGreenCtxStreamCreate, ::cuCtxGetStreamPriorityRange, ::cuStreamGetFlags, + ::cuStreamGetDevice ::cudaStreamGetPriority*/ fn cuStreamGetPriority_ptsz( hStream: cuda_types::cuda::CUstream, priority: *mut ::core::ffi::c_int, ) -> cuda_types::cuda::CUresult; + /** \brief Returns the device handle of the stream + + Returns in \p *device the device handle of the stream + + \param hStream - Handle to the stream to be queried + \param device - Returns the device to which a stream belongs + + \return + ::CUDA_SUCCESS, + ::CUDA_ERROR_DEINITIALIZED, + ::CUDA_ERROR_NOT_INITIALIZED, + ::CUDA_ERROR_INVALID_CONTEXT, + ::CUDA_ERROR_INVALID_VALUE, + ::CUDA_ERROR_INVALID_HANDLE, + ::CUDA_ERROR_OUT_OF_MEMORY + \notefnerr + + \sa + ::cuStreamDestroy, + ::cuStreamCreate, + ::cuGreenCtxStreamCreate, + ::cuStreamGetFlags*/ + fn cuStreamGetDevice_ptsz( + hStream: cuda_types::cuda::CUstream, + device: *mut cuda_types::cuda::CUdevice, + ) -> cuda_types::cuda::CUresult; /** \brief Query the flags of a given stream - Query the flags of a stream created using ::cuStreamCreate or ::cuStreamCreateWithPriority + Query the flags of a stream created using ::cuStreamCreate, ::cuStreamCreateWithPriority or ::cuGreenCtxStreamCreate and return the flags in \p flags. 
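+
+   A minimal sketch of the device query added above (error handling omitted;
+   `hStream` is assumed to be a valid stream):
+   \code
+CUdevice device;
+cuStreamGetDevice(hStream, &device); // device now holds the stream's device
+   \endcode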
\param hStream - Handle to the stream to be queried @@ -9353,8 +10056,10 @@ CU_MEM_OPERATION_TYPE_UNMAP = 2 \sa ::cuStreamDestroy, ::cuStreamCreate, + ::cuGreenCtxStreamCreate, ::cuStreamGetPriority, - ::cudaStreamGetFlags*/ + ::cudaStreamGetFlags + ::cuStreamGetDevice*/ fn cuStreamGetFlags_ptsz( hStream: cuda_types::cuda::CUstream, flags: *mut ::core::ffi::c_uint, @@ -9396,6 +10101,10 @@ CU_MEM_OPERATION_TYPE_UNMAP = 2 Returns the CUDA context that the stream is associated with. + Note there is a later version of this API, ::cuStreamGetCtx_v2. It will + supplant this version in CUDA 13.0. It is recommended to use ::cuStreamGetCtx_v2 + till then as this version will return ::CUDA_ERROR_NOT_SUPPORTED for streams created via the API ::cuGreenCtxStreamCreate. + The stream handle \p hStream can refer to any of the following:
  • a stream created via any of the CUDA driver APIs such as ::cuStreamCreate @@ -9420,22 +10129,84 @@ CU_MEM_OPERATION_TYPE_UNMAP = 2 ::CUDA_ERROR_NOT_INITIALIZED, ::CUDA_ERROR_INVALID_CONTEXT, ::CUDA_ERROR_INVALID_HANDLE, + ::CUDA_ERROR_NOT_SUPPORTED \notefnerr \sa ::cuStreamDestroy, ::cuStreamCreateWithPriority, ::cuStreamGetPriority, ::cuStreamGetFlags, + ::cuStreamGetDevice ::cuStreamWaitEvent, ::cuStreamQuery, ::cuStreamSynchronize, ::cuStreamAddCallback, ::cudaStreamCreate, + ::cuStreamGetCtx_v2, ::cudaStreamCreateWithFlags*/ fn cuStreamGetCtx_ptsz( hStream: cuda_types::cuda::CUstream, pctx: *mut cuda_types::cuda::CUcontext, ) -> cuda_types::cuda::CUresult; + /** \brief Query the contexts associated with a stream + + Returns the contexts that the stream is associated with. + + If the stream is associated with a green context, the API returns the green context in \p pGreenCtx + and the primary context of the associated device in \p pCtx. + + If the stream is associated with a regular context, the API returns the regular context in \p pCtx + and NULL in \p pGreenCtx. + + The stream handle \p hStream can refer to any of the following: +
      +
    • a stream created via any of the CUDA driver APIs such as ::cuStreamCreate, + ::cuStreamCreateWithPriority and ::cuGreenCtxStreamCreate, or their runtime API equivalents such as + ::cudaStreamCreate, ::cudaStreamCreateWithFlags and ::cudaStreamCreateWithPriority. + Passing an invalid handle will result in undefined behavior.
+
    • any of the special streams such as the NULL stream, ::CU_STREAM_LEGACY and + ::CU_STREAM_PER_THREAD. The runtime API equivalents of these are also accepted, + which are NULL, ::cudaStreamLegacy and ::cudaStreamPerThread respectively. + If any of the special handles are specified, the API will operate on the context current to the + calling thread. If a green context (that was converted via ::cuCtxFromGreenCtx() before setting it current) + is current to the calling thread, the API will return the green context in \p pGreenCtx + and the primary context of the associated device in \p pCtx. If a regular context is current, + the API returns the regular context in \p pCtx and NULL in \p pGreenCtx. + Note that specifying ::CU_STREAM_PER_THREAD or ::cudaStreamPerThread will return ::CUDA_ERROR_INVALID_HANDLE + if a green context is current to the calling thread. + If no context is current to the calling thread, ::CUDA_ERROR_INVALID_CONTEXT is returned.
+
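+
+   A minimal sketch of distinguishing the two cases described above (error
+   handling omitted):
+   \code
+CUcontext ctx = NULL;
+CUgreenCtx greenCtx = NULL;
+cuStreamGetCtx_v2(hStream, &ctx, &greenCtx);
+if (greenCtx != NULL) {
+    // stream belongs to a green context; ctx is the device's primary context
+} else {
+    // stream belongs to the regular context returned in ctx
+}
+   \endcode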
    + + \param hStream - Handle to the stream to be queried + \param pCtx - Returned regular context associated with the stream + \param pGreenCtx - Returned green context if the stream is associated with a green context or NULL if not + + \return + ::CUDA_SUCCESS, + ::CUDA_ERROR_DEINITIALIZED, + ::CUDA_ERROR_NOT_INITIALIZED, + ::CUDA_ERROR_INVALID_CONTEXT, + ::CUDA_ERROR_INVALID_HANDLE + \notefnerr + + \sa ::cuStreamDestroy, + ::cuStreamCreate + ::cuStreamCreateWithPriority, + ::cuGreenCtxStreamCreate, + ::cuStreamGetPriority, + ::cuStreamGetFlags, + ::cuStreamGetDevice + ::cuStreamWaitEvent, + ::cuStreamQuery, + ::cuStreamSynchronize, + ::cuStreamAddCallback, + ::cudaStreamCreate, + ::cudaStreamCreateWithFlags,*/ + fn cuStreamGetCtx_v2_ptsz( + hStream: cuda_types::cuda::CUstream, + pCtx: *mut cuda_types::cuda::CUcontext, + pGreenCtx: *mut cuda_types::cuda::CUgreenCtx, + ) -> cuda_types::cuda::CUresult; /** \brief Make a compute stream wait on an event Makes all future work submitted to \p hStream wait for all work captured in @@ -10241,7 +11012,8 @@ cuThreadExchangeStreamCaptureMode(&mode); // restore previous mode /** \brief Records an event Captures in \p hEvent the contents of \p hStream at the time of this call. - \p hEvent and \p hStream must be from the same context. + \p hEvent and \p hStream must be from the same context otherwise + ::CUDA_ERROR_INVALID_HANDLE is returned. Calls such as ::cuEventQuery() or ::cuStreamWaitEvent() will then examine or wait for completion of the work that was captured. Uses of \p hStream after this call do not modify \p hEvent. See note on default @@ -10283,7 +11055,8 @@ cuThreadExchangeStreamCaptureMode(&mode); // restore previous mode /** \brief Records an event Captures in \p hEvent the contents of \p hStream at the time of this call. - \p hEvent and \p hStream must be from the same context. + \p hEvent and \p hStream must be from the same context otherwise + ::CUDA_ERROR_INVALID_HANDLE is returned. Calls such as ::cuEventQuery() or ::cuStreamWaitEvent() will then examine or wait for completion of the work that was captured. Uses of \p hStream after this call do not modify \p hEvent. See note on default @@ -10438,6 +11211,9 @@ cuThreadExchangeStreamCaptureMode(&mode); // restore previous mode the ::CU_EVENT_DISABLE_TIMING flag, then this function will return ::CUDA_ERROR_INVALID_HANDLE. + Note there is a later version of this API, ::cuEventElapsedTime_v2. It will + supplant this version in CUDA 13.0, which is retained for minor version compatibility. + \param pMilliseconds - Time between \p hStart and \p hEnd in ms \param hStart - Starting event \param hEnd - Ending event @@ -10463,6 +11239,55 @@ cuThreadExchangeStreamCaptureMode(&mode); // restore previous mode hStart: cuda_types::cuda::CUevent, hEnd: cuda_types::cuda::CUevent, ) -> cuda_types::cuda::CUresult; + /** \brief Computes the elapsed time between two events + + Computes the elapsed time between two events (in milliseconds with a + resolution of around 0.5 microseconds). Note this API is not guaranteed + to return the latest errors for pending work. As such this API is intended to + serve as an elapsed time calculation only and any polling for completion on the + events to be compared should be done with ::cuEventQuery instead. + + If either event was last recorded in a non-NULL stream, the resulting time + may be greater than expected (even if both used the same stream handle). 
This + happens because the ::cuEventRecord() operation takes place asynchronously + and there is no guarantee that the measured latency is actually just between + the two events. Any number of other different stream operations could execute + in between the two measured events, thus altering the timing in a significant + way. + + If ::cuEventRecord() has not been called on either event then + ::CUDA_ERROR_INVALID_HANDLE is returned. If ::cuEventRecord() has been called + on both events but one or both of them has not yet been completed (that is, + ::cuEventQuery() would return ::CUDA_ERROR_NOT_READY on at least one of the + events), ::CUDA_ERROR_NOT_READY is returned. If either event was created with + the ::CU_EVENT_DISABLE_TIMING flag, then this function will return + ::CUDA_ERROR_INVALID_HANDLE. + + \param pMilliseconds - Time between \p hStart and \p hEnd in ms + \param hStart - Starting event + \param hEnd - Ending event + + \return + ::CUDA_SUCCESS, + ::CUDA_ERROR_DEINITIALIZED, + ::CUDA_ERROR_NOT_INITIALIZED, + ::CUDA_ERROR_INVALID_CONTEXT, + ::CUDA_ERROR_INVALID_HANDLE, + ::CUDA_ERROR_NOT_READY, + ::CUDA_ERROR_UNKNOWN + \notefnerr + + \sa ::cuEventCreate, + ::cuEventRecord, + ::cuEventQuery, + ::cuEventSynchronize, + ::cuEventDestroy, + ::cudaEventElapsedTime*/ + fn cuEventElapsedTime_v2( + pMilliseconds: *mut f32, + hStart: cuda_types::cuda::CUevent, + hEnd: cuda_types::cuda::CUevent, + ) -> cuda_types::cuda::CUresult; /** \brief Imports an external memory object Imports an externally allocated memory object and returns @@ -10501,7 +11326,7 @@ CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP = 4, CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE = 5, CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE = 6, CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT = 7, -CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF = 8 +CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF = 8, } CUexternalMemoryHandleType; \endcode @@ -10716,6 +11541,7 @@ unsigned int numLevels; If \p extMem was imported from a handle of type ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF, then ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::numLevels must be equal to 1. + The returned CUDA mipmapped array must be freed using ::cuMipmappedArrayDestroy. \param mipmap - Returned CUDA mipmapped array @@ -11432,6 +12258,9 @@ CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32 = 10 positive. The validity of the cluster dimensions is checked at launch time. If the value is set during compile time, it cannot be set at runtime. Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED. + - ::CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED: Indicates whether + the function can be launched with non-portable cluster size. 1 is allowed, + 0 is disallowed. - ::CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE: The block scheduling policy of a function. The value type is CUclusterSchedulingPolicy. 
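+
+   A minimal sketch of setting the cluster shape at runtime, per the attribute
+   descriptions above (only valid when the kernel was not compiled with a fixed
+   cluster size; `func` is assumed to be a loaded ::CUfunction):
+   \code
+cuFuncSetAttribute(func, CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH, 2);
+cuFuncSetAttribute(func, CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT, 1);
+cuFuncSetAttribute(func, CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH, 1);
+int nonPortable = 0;
+cuFuncGetAttribute(&nonPortable, CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED, func);
+   \endcode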
@@ -11823,6 +12652,7 @@ status = cuLaunchKernel(f, gx, gy, gz, bx, by, bz, sh, s, NULL, config);
 CU_LAUNCH_ATTRIBUTE_PRIORITY = 8,
 CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP = 9,
 CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN = 10,
+  CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION = 11,
 CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT = 12,
 CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE = 13,
 } CUlaunchAttributeID;
@@ -11849,6 +12679,11 @@ status = cuLaunchKernel(f, gx, gy, gz, bx, by, bz, sh, s, NULL, config);
 int priority;
 CUlaunchMemSyncDomainMap memSyncDomainMap;
 CUlaunchMemSyncDomain memSyncDomain;
+  struct {
+      unsigned int x;
+      unsigned int y;
+      unsigned int z;
+  } preferredClusterDim;
 struct {
 CUevent event;
 int flags;
@@ -11920,6 +12755,36 @@ status = cuLaunchKernel(f, gx, gy, gz, bx, by, bz, sh, s, NULL, config);
 opt out, and any attempt to set the attribute to 0 will result in an error. Graphs
 containing one or more device-updatable node also do not allow multiple instantiation.

+   ::CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION allows the kernel launch to
+   specify a preferred substitute cluster dimension. Blocks may be grouped
+   according to either the dimensions specified with this attribute (grouped
+   into a "preferred substitute cluster"), or the one specified with
+   ::CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION attribute (grouped into a "regular
+   cluster"). The cluster dimensions of a "preferred substitute cluster" shall
+   be an integer multiple greater than zero of the regular cluster dimensions.
+   The device will attempt - on a best-effort basis - to group thread blocks
+   into preferred clusters over grouping them into regular clusters. When it
+   deems necessary (primarily when the device temporarily runs out of physical
+   resources to launch the larger preferred clusters), the device may switch to
+   launch the regular clusters instead to attempt to utilize as much of the
+   physical device resources as possible.
+
+   Each type of cluster will have its enumeration / coordinate setup as if the
+   grid consists solely of its type of cluster. For example, if the preferred
+   substitute cluster dimensions double the regular cluster dimensions, there
+   might be simultaneously a regular cluster indexed at (1,0,0), and a preferred
+   cluster indexed at (1,0,0). In this example, the preferred substitute cluster
+   (1,0,0) replaces regular clusters (2,0,0) and (3,0,0) and groups their
+   blocks.
+
+   This attribute will only take effect when a regular cluster dimension has
+   been specified. The preferred substitute cluster dimension must be an
+   integer multiple greater than zero of the regular cluster dimension and
+   must divide the grid. It must also be no more than `maxBlocksPerCluster`,
+   if that limit is set in the kernel's `__launch_bounds__`, and otherwise
+   less than the maximum value the driver can support. Beyond these
+   requirements, setting this attribute to a value physically unable to fit
+   on any particular device is permitted.

 The effect of other attributes is consistent with their effect when set via
 persistent APIs.
@@ -11987,12 +12852,6 @@ status = cuLaunchKernel(f, gx, gy, gz, bx, by, bz, sh, s, NULL, config);
 grid of blocks. Each block contains \p blockDimX x \p blockDimY x
 \p blockDimZ threads.

-   Note that the API can also be used to launch context-less kernel ::CUkernel
-   by querying the handle using ::cuLibraryGetKernel() and then passing it
-   to the API by casting to ::CUfunction.
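+
+   A minimal launch sketch for the attribute described above (illustrative sizes;
+   `kernel` and `stream` are assumed, and the preferred dimension is a multiple
+   of the regular one that divides the grid):
+   \code
+CUlaunchAttribute attrs[2];
+attrs[0].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
+attrs[0].value.clusterDim.x = 2;
+attrs[0].value.clusterDim.y = 1;
+attrs[0].value.clusterDim.z = 1;
+attrs[1].id = CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION;
+attrs[1].value.preferredClusterDim.x = 4; // 2x the regular cluster dimension
+attrs[1].value.preferredClusterDim.y = 1;
+attrs[1].value.preferredClusterDim.z = 1;
+CUlaunchConfig config = {0};
+config.gridDimX = 16; // divisible by both cluster dimensions
+config.gridDimY = 1;
+config.gridDimZ = 1;
+config.blockDimX = 128;
+config.blockDimY = 1;
+config.blockDimZ = 1;
+config.hStream = stream;
+config.attrs = attrs;
+config.numAttrs = 2;
+cuLaunchKernelEx(&config, kernel, NULL /*kernelParams*/, NULL /*extra*/);
+   \endcode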
-   Here, the context to launch the kernel on will either be taken from the
-   specified stream \p hStream or the current context in case of NULL stream.
-
 \p sharedMemBytes sets the amount of dynamic shared memory that will be
 available to each thread block.
@@ -15018,18 +15877,22 @@ CUgraphInstantiateResult result_out;
 contained \p memsetParams at instantiation. hNode must remain in the
 graph which was used to instantiate \p hGraphExec. Changed edges to and
 from hNode are ignored.

-   The destination memory in \p memsetParams must be allocated from the same
-   contexts as the original destination memory. Both the instantiation-time
-   memory operand and the memory operand in \p memsetParams must be 1-dimensional.
-   Zero-length operations are not supported.
+   Zero-sized operations are not supported.
+
+   The new destination pointer in memsetParams must point to the same kind of allocation
+   as the original destination pointer and have the same context association and device mapping
+   as the original destination pointer.
+
+   Both the value and pointer address may be updated.
+   Changing other aspects of the memset (width, height, element size or pitch) may cause the update to be rejected.
+   Specifically, for 2d memsets, all dimension changes are rejected.
+   For 1d memsets, changes in height are explicitly rejected and other changes are opportunistically allowed
+   if the resulting work maps onto the work resources already allocated for the node.

 The modifications only affect future launches of \p hGraphExec. Already
 enqueued or running launches of \p hGraphExec are not affected by this call.
 hNode is also not modified by this call.

-   Returns CUDA_ERROR_INVALID_VALUE if the memory operand's mappings changed or
-   either the original or new memory operand are multidimensional.
-
 \param hGraphExec - The executable graph in which to set the specified node
 \param hNode - Memset node from the graph which was used to instantiate graphExec
 \param memsetParams - The updated parameters to set
@@ -15517,7 +16380,9 @@ CUgraphInstantiateResult result_out;
 - The CUDA device(s) to which the operand(s) was allocated/mapped cannot change.
 - The source/destination memory must be allocated from the same
 contexts as the original source/destination memory.
-  - Only 1D memsets can be changed.
+  - For 2d memsets, only address and assigned value may be updated.
+  - For 1d memsets, updating dimensions is also allowed, but may fail if the resulting operation doesn't
+  map onto the work resources already allocated for the node.
 - Additional memcpy node restrictions:
 - Changing either the source or destination memory type(i.e. CU_MEMORYTYPE_DEVICE,
 CU_MEMORYTYPE_ARRAY, etc.) is not supported.
@@ -15989,6 +16854,7 @@ CUgraphInstantiateResult result_out;
 \param hGraph - Graph which will contain the conditional node using this handle.
 \param ctx - Context for the handle and associated conditional node.
 \param defaultLaunchValue - Optional initial value for the conditional variable.
+  Applied at the beginning of each graph execution if CU_GRAPH_COND_ASSIGN_DEFAULT is set in \p flags.
 \param flags - Currently must be CU_GRAPH_COND_ASSIGN_DEFAULT or 0.

 \return
@@ -16012,6 +16878,11 @@ CUgraphInstantiateResult result_out;
 Returns in \p *numBlocks the number of the maximum active blocks per
 streaming multiprocessor.

+   Note that the API can also be used with context-less kernel ::CUkernel
+   by querying the handle using ::cuLibraryGetKernel() and then passing it
+   to the API by casting to ::CUfunction.
+   Here, the context to use for calculations will be the current context.
+
 \param numBlocks - Returned occupancy
 \param func - Kernel for which occupancy is calculated
 \param blockSize - Block size the kernel is intended to be launched with
@@ -16055,6 +16926,11 @@ CUgraphInstantiateResult result_out;
 can be found about this feature in the "Unified L1/Texture Cache" section of
 the Maxwell tuning guide.

+   Note that the API can also be used with context-less kernel ::CUkernel
+   by querying the handle using ::cuLibraryGetKernel() and then passing it
+   to the API by casting to ::CUfunction. Here, the context to use for calculations
+   will be the current context.
+
 \param numBlocks - Returned occupancy
 \param func - Kernel for which occupancy is calculated
 \param blockSize - Block size the kernel is intended to be launched with
@@ -16109,6 +16985,11 @@ CUgraphInstantiateResult result_out;
 size_t blockToSmem(int blockSize);
 \endcode

+   Note that the API can also be used with context-less kernel ::CUkernel
+   by querying the handle using ::cuLibraryGetKernel() and then passing it
+   to the API by casting to ::CUfunction. Here, the context to use for calculations
+   will be the current context.
+
 \param minGridSize - Returned minimum grid size needed to achieve the maximum occupancy
 \param blockSize - Returned maximum block size that can achieve the maximum occupancy
 \param func - Kernel for which launch configuration is calculated
@@ -16158,6 +17039,11 @@ CUgraphInstantiateResult result_out;
 can be found about this feature in the "Unified L1/Texture Cache" section of
 the Maxwell tuning guide.

+   Note that the API can also be used with context-less kernel ::CUkernel
+   by querying the handle using ::cuLibraryGetKernel() and then passing it
+   to the API by casting to ::CUfunction. Here, the context to use for calculations
+   will be the current context.
+
 \param minGridSize - Returned minimum grid size needed to achieve the maximum occupancy
 \param blockSize - Returned maximum block size that can achieve the maximum occupancy
 \param func - Kernel for which launch configuration is calculated
@@ -16190,6 +17076,11 @@ CUgraphInstantiateResult result_out;
 Returns in \p *dynamicSmemSize the maximum size of dynamic shared memory to allow \p numBlocks blocks per SM.

+   Note that the API can also be used with context-less kernel ::CUkernel
+   by querying the handle using ::cuLibraryGetKernel() and then passing it
+   to the API by casting to ::CUfunction. Here, the context to use for calculations
+   will be the current context.
+
 \param dynamicSmemSize - Returned maximum dynamic shared memory
 \param func - Kernel function for which occupancy is calculated
 \param numBlocks - Number of blocks to fit on SM
@@ -16222,6 +17113,12 @@ CUgraphInstantiateResult result_out;
 This function will respect the compile time launch bounds.

+   Note that the API can also be used with context-less kernel ::CUkernel
+   by querying the handle using ::cuLibraryGetKernel() and then passing it
+   to the API by casting to ::CUfunction. Here, the context to use for calculations
+   will either be taken from the specified stream \p config->hStream
+   or the current context in case of NULL stream.
+
 \param clusterSize - Returned maximum cluster size that can be launched
 for the given kernel function and launch configuration
 \param func - Kernel function for which maximum cluster
@@ -16259,6 +17156,12 @@ CUgraphInstantiateResult result_out;
 calculation.
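+
+   A minimal sketch of the context-less usage described in the notes above
+   (`library` is assumed to be a loaded ::CUlibrary; the kernel name is a placeholder):
+   \code
+CUkernel kernel;
+cuLibraryGetKernel(&kernel, library, "my_kernel");
+int numBlocks = 0;
+cuOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, (CUfunction)kernel,
+                                            256 /*blockSize*/, 0 /*dynamicSMemSize*/);
+   \endcode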
Runtime environment may affect how the hardware schedules the clusters, so the calculated occupancy is not guaranteed to be achievable. + Note that the API can also be used with context-less kernel ::CUkernel + by querying the handle using ::cuLibraryGetKernel() and then passing it + to the API by casting to ::CUfunction. Here, the context to use for calculations + will either be taken from the specified stream \p config->hStream + or the current context in case of NULL stream. + \param numClusters - Returned maximum number of clusters that could co-exist on the target device \param func - Kernel function for which maximum number @@ -17192,7 +18095,8 @@ CU_TR_FILTER_MODE_LINEAR = 1 \p pResViewDesc is an optional argument that specifies an alternate format for the data described by \p pResDesc, and also describes the subresource region to restrict access to when texturing. \p pResViewDesc can only be specified if - the type of resource is a CUDA array or a CUDA mipmapped array. + the type of resource is a CUDA array or a CUDA mipmapped array not in a block + compressed format. Texture objects are only supported on devices of compute capability 3.0 or higher. Additionally, a texture object is an opaque value, and, as such, should only be @@ -17570,7 +18474,7 @@ unsigned int lastLayer; Tensor map objects are only supported on devices of compute capability 9.0 or higher. Additionally, a tensor map object is an opaque value, and, as such, should only be - accessed through CUDA API calls. + accessed through CUDA APIs and PTX. The parameters passed are bound to the following requirements: @@ -17591,21 +18495,33 @@ CU_TENSOR_MAP_DATA_TYPE_FLOAT64, // 8 bytes CU_TENSOR_MAP_DATA_TYPE_BFLOAT16, // 2 bytes CU_TENSOR_MAP_DATA_TYPE_FLOAT32_FTZ, // 4 bytes CU_TENSOR_MAP_DATA_TYPE_TFLOAT32, // 4 bytes -CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ // 4 bytes +CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ, // 4 bytes +CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B, // 4 bits +CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, // 4 bits +CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B // 6 bits } CUtensorMapDataType; \endcode + ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B copies '16 x U4' packed values to memory aligned as 8 bytes. There are no gaps between packed values. + ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B copies '16 x U4' packed values to memory aligned as 16 bytes. There are 8 byte gaps between every 8 byte chunk of packed values. + ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B copies '16 x U6' packed values to memory aligned as 16 bytes. There are 4 byte gaps between every 12 byte chunk of packed values. - \p tensorRank must be non-zero and less than or equal to the maximum supported dimensionality of 5. If \p interleave is not ::CU_TENSOR_MAP_INTERLEAVE_NONE, then \p tensorRank must additionally be greater than or equal to 3. - - \p globalAddress, which specifies the starting address of the memory region described, must be 32 byte aligned when \p interleave is - ::CU_TENSOR_MAP_INTERLEAVE_32B and 16 byte aligned otherwise. + - \p globalAddress, which specifies the starting address of the memory region described, must be 16 byte aligned. The following requirements need to also be met: + - When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B, \p globalAddress must be 32 byte aligned. + - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, \p globalAddress must be 32 byte aligned. 
- \p globalDim array, which specifies tensor size of each of the \p tensorRank dimensions, must be non-zero and less than or - equal to 2^32. + equal to 2^32. Additionally, the following requirements need to be met for the packed data types: + - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, globalDim[0] must be a multiple of 128. + - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B, \p globalDim[0] must be a multiple of 2. + - Dimension for the packed data types must reflect the number of individual U# values. - \p globalStrides array, which specifies tensor stride of each of the lower \p tensorRank - 1 dimensions in bytes, must be a - multiple of 16 and less than 2^40. Additionally, the stride must be a multiple of 32 when \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B. + multiple of 16 and less than 2^40. Additionally, the following requirements need to be met: + - When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B, the strides must be a multiple of 32. + - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, the strides must be a multiple of 32. Each following dimension specified includes previous dimension stride: \code globalStrides[0] = globalDim[0] * elementSizeInBytes(tensorDataType) + padding[0]; @@ -17615,9 +18531,9 @@ assert(globalStrides[i] >= globalDim[i]); \endcode - \p boxDim array, which specifies number of elements to be traversed along each of the \p tensorRank dimensions, must be non-zero - and less than or equal to 256. - When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_NONE, { \p boxDim[0] * elementSizeInBytes( \p tensorDataType ) } must be a multiple - of 16 bytes. + and less than or equal to 256. Additionally, the following requirements need to be met: + - When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_NONE, { \p boxDim[0] * elementSizeInBytes( \p tensorDataType ) } must be a multiple of 16 bytes. + - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, boxDim[0] must be 128. - \p elementStrides array, which specifies the iteration step along each of the \p tensorRank dimensions, must be non-zero and less than or equal to 8. Note that when \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_NONE, the first element of this array is ignored since @@ -17638,17 +18554,21 @@ CU_TENSOR_MAP_INTERLEAVE_32B uses 32 bytes. When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_NONE and \p swizzle is not ::CU_TENSOR_MAP_SWIZZLE_NONE, the bounding box inner dimension (computed as \p boxDim[0] multiplied by element size derived from \p tensorDataType) must be less than or equal to the swizzle size. - - CU_TENSOR_MAP_SWIZZLE_32B implies the bounding box inner dimension will be <= 32. - - CU_TENSOR_MAP_SWIZZLE_64B implies the bounding box inner dimension will be <= 64. - - CU_TENSOR_MAP_SWIZZLE_128B implies the bounding box inner dimension will be <= 128. + - CU_TENSOR_MAP_SWIZZLE_32B requires the bounding box inner dimension to be <= 32. + - CU_TENSOR_MAP_SWIZZLE_64B requires the bounding box inner dimension to be <= 64. + - CU_TENSOR_MAP_SWIZZLE_128B* require the bounding box inner dimension to be <= 128. + Additionally, \p tensorDataType of ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B requires \p interleave to be ::CU_TENSOR_MAP_INTERLEAVE_NONE. 
 - \p swizzle, which specifies the shared memory bank swizzling pattern, has to be of type
 ::CUtensorMapSwizzle which is defined as:
 \code
typedef enum CUtensorMapSwizzle_enum {
CU_TENSOR_MAP_SWIZZLE_NONE = 0,
-CU_TENSOR_MAP_SWIZZLE_32B,
-CU_TENSOR_MAP_SWIZZLE_64B,
-CU_TENSOR_MAP_SWIZZLE_128B
+CU_TENSOR_MAP_SWIZZLE_32B, // Swizzle 16B chunks within 32B span
+CU_TENSOR_MAP_SWIZZLE_64B, // Swizzle 16B chunks within 64B span
+CU_TENSOR_MAP_SWIZZLE_128B, // Swizzle 16B chunks within 128B span
+CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B, // Swizzle 32B chunks within 128B span
+CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B_FLIP_8B, // Swizzle 32B chunks within 128B span, additionally swap lower 8B with upper 8B within each 16B for every alternate row
+CU_TENSOR_MAP_SWIZZLE_128B_ATOM_64B // Swizzle 64B chunks within 128B span
} CUtensorMapSwizzle;
 \endcode
 Data are organized in a specific order in global memory; however, this may not match the order in which the application accesses data
 in shared memory. This difference in data organization may cause bank conflicts when shared memory is accessed. In order to avoid this
 problem, data can be loaded to shared memory with shuffling across shared memory banks.
 When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B, \p swizzle must be ::CU_TENSOR_MAP_SWIZZLE_32B.
 Other interleave modes can have any swizzling pattern.
+   When the \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B, only the following swizzle modes are supported:
+   - CU_TENSOR_MAP_SWIZZLE_NONE (Load & Store)
+   - CU_TENSOR_MAP_SWIZZLE_128B (Load & Store)
+   - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B (Load & Store)
+   - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_64B (Store only)
+   When the \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, only the following swizzle modes are supported:
+   - CU_TENSOR_MAP_SWIZZLE_NONE (Load only)
+   - CU_TENSOR_MAP_SWIZZLE_128B (Load only)
+   - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B (Load only)
 - \p l2Promotion specifies L2 fetch size which indicates the byte granularity at which L2 requests are filled from DRAM. It must be of
 type ::CUtensorMapL2promotion, which is defined as:
 \code
typedef enum CUtensorMapL2promotion_enum {
CU_TENSOR_MAP_L2_PROMOTION_NONE = 0,
CU_TENSOR_MAP_L2_PROMOTION_L2_64B,
CU_TENSOR_MAP_L2_PROMOTION_L2_128B,
CU_TENSOR_MAP_L2_PROMOTION_L2_256B
} CUtensorMapL2promotion;
 \endcode
@@ -17676,7 +18605,8 @@ CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE = 0,
 CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA
 } CUtensorMapFloatOOBfill;
 \endcode
-   Note that ::CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA can only be used when \p tensorDataType represents a floating-point data type.
+   Note that ::CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA can only be used when \p tensorDataType represents a floating-point data type,
+   and when \p tensorDataType is not ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B, ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, and ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B.

 \param tensorMap - Tensor map object to create
 \param tensorDataType - Tensor data type
@@ -17700,6 +18630,7 @@ CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA

 \sa
 ::cuTensorMapEncodeIm2col,
+   ::cuTensorMapEncodeIm2colWide,
 ::cuTensorMapReplaceAddress*/
 fn cuTensorMapEncodeTiled(
 tensorMap: *mut cuda_types::cuda::CUtensorMap,
 tensorDataType: cuda_types::cuda::CUtensorMapDataType,
@@ -17722,7 +18653,7 @@ CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA
 Tensor map objects are only supported on devices of compute capability 9.0 or higher.
 Additionally, a tensor map object is an opaque value, and, as such, should only be
-   accessed through CUDA API calls.
+   accessed through CUDA APIs and PTX.
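+
+   A minimal encode sketch for the tiled case above (illustrative values chosen to
+   satisfy the listed constraints; the allocation stands in for a real tensor):
+   \code
+CUdeviceptr dptr;
+cuMemAlloc(&dptr, 1024 * 1024 * sizeof(float)); // cuMemAlloc is at least 16-byte aligned
+CUtensorMap tensorMap; // CUtensorMap is 64-byte aligned by its definition
+cuuint64_t globalDim[2] = {1024, 1024};
+cuuint64_t globalStrides[1] = {1024 * sizeof(float)}; // multiple of 16
+cuuint32_t boxDim[2] = {32, 32}; // 32 * 4 bytes = 128B inner dimension <= swizzle size
+cuuint32_t elementStrides[2] = {1, 1};
+cuTensorMapEncodeTiled(&tensorMap, CU_TENSOR_MAP_DATA_TYPE_FLOAT32, 2, (void *)dptr,
+                       globalDim, globalStrides, boxDim, elementStrides,
+                       CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_128B,
+                       CU_TENSOR_MAP_L2_PROMOTION_L2_128B,
+                       CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
+   \endcode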
The parameters passed are bound to the following requirements: @@ -17744,19 +18675,31 @@ CU_TENSOR_MAP_DATA_TYPE_BFLOAT16, // 2 bytes CU_TENSOR_MAP_DATA_TYPE_FLOAT32_FTZ, // 4 bytes CU_TENSOR_MAP_DATA_TYPE_TFLOAT32, // 4 bytes CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ // 4 bytes +CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B, // 4 bits +CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, // 4 bits +CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B // 6 bits } CUtensorMapDataType; \endcode + ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B copies '16 x U4' packed values to memory aligned as 8 bytes. There are no gaps between packed values. + ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B copies '16 x U4' packed values to memory aligned as 16 bytes. There are 8 byte gaps between every 8 byte chunk of packed values. + ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B copies '16 x U6' packed values to memory aligned as 16 bytes. There are 4 byte gaps between every 12 byte chunk of packed values. - \p tensorRank, which specifies the number of tensor dimensions, must be 3, 4, or 5. - - \p globalAddress, which specifies the starting address of the memory region described, must be 32 byte aligned when \p interleave is - ::CU_TENSOR_MAP_INTERLEAVE_32B and 16 byte aligned otherwise. + - \p globalAddress, which specifies the starting address of the memory region described, must be 16 byte aligned. The following requirements need to also be met: + - When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B, \p globalAddress must be 32 byte aligned. + - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, \p globalAddress must be 32 byte aligned. - \p globalDim array, which specifies tensor size of each of the \p tensorRank dimensions, must be non-zero and less than or - equal to 2^32. + equal to 2^32. Additionally, the following requirements need to be met for the packed data types: + - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, globalDim[0] must be a multiple of 128. + - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B, \p globalDim[0] must be a multiple of 2. + - Dimension for the packed data types must reflect the number of individual U# values. - \p globalStrides array, which specifies tensor stride of each of the lower \p tensorRank - 1 dimensions in bytes, must be a - multiple of 16 and less than 2^40. Additionally, the stride must be a multiple of 32 when \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B. + multiple of 16 and less than 2^40. Additionally, the following requirements need to be met: + - When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B, the strides must be a multiple of 32. + - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, the strides must be a multiple of 32. Each following dimension specified includes previous dimension stride: \code globalStrides[0] = globalDim[0] * elementSizeInBytes(tensorDataType) + padding[0]; @@ -17779,6 +18722,7 @@ assert(globalStrides[i] >= globalDim[i]); The bounding box specified by \p pixelBoxLowerCorner and \p pixelBoxUpperCorner must have non-zero area. - \p channelsPerPixel, which specifies the number of elements which must be accessed along C dimension, must be less than or equal to 256. + Additionally, when \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, \p channelsPerPixel must be 128. 
- \p pixelsPerColumn, which specifies the number of elements that must be accessed along the {N, D, H, W} dimensions, must be less than or equal to 1024. @@ -17801,18 +18745,22 @@ CU_TENSOR_MAP_INTERLEAVE_32B TMA supports interleaved layouts like NC/8HWC8 where C8 utilizes 16 bytes in memory assuming 2 byte per channel or NC/16HWC16 where C16 uses 32 bytes. When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_NONE and \p swizzle is not ::CU_TENSOR_MAP_SWIZZLE_NONE, the bounding box inner dimension - (computed as \p boxDim[0] multiplied by element size derived from \p tensorDataType) must be less than or equal to the swizzle size. - - CU_TENSOR_MAP_SWIZZLE_32B implies the bounding box inner dimension will be <= 32. - - CU_TENSOR_MAP_SWIZZLE_64B implies the bounding box inner dimension will be <= 64. - - CU_TENSOR_MAP_SWIZZLE_128B implies the bounding box inner dimension will be <= 128. + (computed as \p channelsPerPixel multiplied by element size in bytes derived from \p tensorDataType) must be less than or equal to the swizzle size. + - CU_TENSOR_MAP_SWIZZLE_32B requires the bounding box inner dimension to be <= 32. + - CU_TENSOR_MAP_SWIZZLE_64B requires the bounding box inner dimension to be <= 64. + - CU_TENSOR_MAP_SWIZZLE_128B* require the bounding box inner dimension to be <= 128. + Additionally, \p tensorDataType of ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B requires \p interleave to be ::CU_TENSOR_MAP_INTERLEAVE_NONE. - \p swizzle, which specifies the shared memory bank swizzling pattern, has to be of type ::CUtensorMapSwizzle which is defined as: \code typedef enum CUtensorMapSwizzle_enum { CU_TENSOR_MAP_SWIZZLE_NONE = 0, -CU_TENSOR_MAP_SWIZZLE_32B, -CU_TENSOR_MAP_SWIZZLE_64B, -CU_TENSOR_MAP_SWIZZLE_128B +CU_TENSOR_MAP_SWIZZLE_32B, // Swizzle 16B chunks within 32B span +CU_TENSOR_MAP_SWIZZLE_64B, // Swizzle 16B chunks within 64B span +CU_TENSOR_MAP_SWIZZLE_128B, // Swizzle 16B chunks within 128B span +CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B, // Swizzle 32B chunks within 128B span +CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B_FLIP_8B, // Swizzle 32B chunks within 128B span, additionally swap lower 8B with upper 8B within each 16B for every alternate row +CU_TENSOR_MAP_SWIZZLE_128B_ATOM_64B // Swizzle 64B chunks within 128B span } CUtensorMapSwizzle; \endcode Data are organized in a specific order in global memory; however, this may not match the order in which the application accesses data @@ -17820,6 +18768,15 @@ CU_TENSOR_MAP_SWIZZLE_128B problem, data can be loaded to shared memory with shuffling across shared memory banks. When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B, \p swizzle must be ::CU_TENSOR_MAP_SWIZZLE_32B. Other interleave modes can have any swizzling pattern. + When the \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B, only the following swizzle modes are supported: + - CU_TENSOR_MAP_SWIZZLE_NONE (Load & Store) + - CU_TENSOR_MAP_SWIZZLE_128B (Load & Store) + - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B (Load & Store) + - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_64B (Store only) + When the \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, only the following swizzle modes are supported: + - CU_TENSOR_MAP_SWIZZLE_NONE (Load only) + - CU_TENSOR_MAP_SWIZZLE_128B (Load only) + - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B (Load only) - \p l2Promotion specifies L2 fetch size which indicates the byte granularity at which L2 requests are filled from DRAM. 
 It must be of
 type ::CUtensorMapL2promotion, which is defined as:
@@ -17840,7 +18797,8 @@ CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE = 0,
 CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA
 } CUtensorMapFloatOOBfill;
 \endcode
-   Note that ::CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA can only be used when \p tensorDataType represents a floating-point data type.
+   Note that ::CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA can only be used when \p tensorDataType represents a floating-point data type,
+   and when \p tensorDataType is not ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B, ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, and ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B.

 \param tensorMap - Tensor map object to create
 \param tensorDataType - Tensor data type
@@ -17867,6 +18825,7 @@ CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA

 \sa
 ::cuTensorMapEncodeTiled,
+   ::cuTensorMapEncodeIm2colWide,
 ::cuTensorMapReplaceAddress*/
 fn cuTensorMapEncodeIm2col(
 tensorMap: *mut cuda_types::cuda::CUtensorMap,
@@ -17885,6 +18844,204 @@ CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA
 l2Promotion: cuda_types::cuda::CUtensorMapL2promotion,
 oobFill: cuda_types::cuda::CUtensorMapFloatOOBfill,
 ) -> cuda_types::cuda::CUresult;
+    /** \brief Create a tensor map descriptor object representing an im2col memory region, but where
+ the elements are exclusively loaded along the W dimension.
+
+ Creates a descriptor for a Tensor Memory Access (TMA) object specified by the parameters
+ describing an im2col memory layout where the row is always loaded along the W dimension,
+ and returns it in \p tensorMap. This assumes the tensor layout in memory is either NDHWC,
+ NHWC, or NWC.
+
+ This API is only supported on devices of compute capability 10.0 or higher.
+ Additionally, a tensor map object is an opaque value, and, as such, should only be
+ accessed through CUDA APIs and PTX.
+
+ The parameters passed are bound to the following requirements:
+
+ - \p tensorMap address must be aligned to 64 bytes.
+
+ - \p tensorDataType has to be an enum from ::CUtensorMapDataType which is defined as:
+ \code
+typedef enum CUtensorMapDataType_enum {
+CU_TENSOR_MAP_DATA_TYPE_UINT8 = 0, // 1 byte
+CU_TENSOR_MAP_DATA_TYPE_UINT16, // 2 bytes
+CU_TENSOR_MAP_DATA_TYPE_UINT32, // 4 bytes
+CU_TENSOR_MAP_DATA_TYPE_INT32, // 4 bytes
+CU_TENSOR_MAP_DATA_TYPE_UINT64, // 8 bytes
+CU_TENSOR_MAP_DATA_TYPE_INT64, // 8 bytes
+CU_TENSOR_MAP_DATA_TYPE_FLOAT16, // 2 bytes
+CU_TENSOR_MAP_DATA_TYPE_FLOAT32, // 4 bytes
+CU_TENSOR_MAP_DATA_TYPE_FLOAT64, // 8 bytes
+CU_TENSOR_MAP_DATA_TYPE_BFLOAT16, // 2 bytes
+CU_TENSOR_MAP_DATA_TYPE_FLOAT32_FTZ, // 4 bytes
+CU_TENSOR_MAP_DATA_TYPE_TFLOAT32, // 4 bytes
+CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ, // 4 bytes
+CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B, // 4 bits
+CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, // 4 bits
+CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B // 6 bits
+} CUtensorMapDataType;
+ \endcode
+ ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B copies '16 x U4' packed values to memory aligned as 8 bytes. There are no gaps between packed values.
+ ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B copies '16 x U4' packed values to memory aligned as 16 bytes. There are 8 byte gaps between every 8 byte chunk of packed values.
+ ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B copies '16 x U6' packed values to memory aligned as 16 bytes. There are 4 byte gaps between every 12 byte chunk of packed values.
+
+ - \p tensorRank, which specifies the number of tensor dimensions, must be 3, 4, or 5.
+ + - \p globalAddress, which specifies the starting address of the memory region described, must be 16 byte aligned. The following requirements need to also be met: + - When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B, \p globalAddress must be 32 byte aligned. + - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, \p globalAddress must be 32 byte aligned. + + - \p globalDim array, which specifies tensor size of each of the \p tensorRank dimensions, must be non-zero and less than or + equal to 2^32. Additionally, the following requirements need to be met for the packed data types: + - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, globalDim[0] must be a multiple of 128. + - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B, \p globalDim[0] must be a multiple of 2. + - Dimension for the packed data types must reflect the number of individual U# values. + + - \p globalStrides array, which specifies tensor stride of each of the lower \p tensorRank - 1 dimensions in bytes, must be a + multiple of 16 and less than 2^40. Additionally, the following requirements need to be met: + - When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B, the strides must be a multiple of 32. + - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, the strides must be a multiple of 32. + Each following dimension specified includes previous dimension stride: + \code +globalStrides[0] = globalDim[0] * elementSizeInBytes(tensorDataType) + padding[0]; +for (i = 1; i < tensorRank - 1; i++) +globalStrides[i] = globalStrides[i – 1] * (globalDim[i] + padding[i]); +assert(globalStrides[i] >= globalDim[i]); + \endcode + + - \p pixelBoxLowerCornerWidth specifies the coordinate offset W of the bounding box from left corner. The offset must be + within range [-32768, 32767]. + + - \p pixelBoxUpperCornerWidth specifies the coordinate offset W of the bounding box from right corner. The offset must be + within range [-32768, 32767]. + + The bounding box specified by \p pixelBoxLowerCornerWidth and \p pixelBoxUpperCornerWidth must have non-zero area. Note + that the size of the box along D and H dimensions is always equal to one. + + - \p channelsPerPixel, which specifies the number of elements which must be accessed along C dimension, must be less than or equal to 256. + Additionally, when \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, \p channelsPerPixel must be 128. + + - \p pixelsPerColumn, which specifies the number of elements that must be accessed along the W dimension, must be less than or + equal to 1024. This field is ignored when \p mode is ::CU_TENSOR_MAP_IM2COL_WIDE_MODE_W128. + + - \p elementStrides array, which specifies the iteration step along each of the \p tensorRank dimensions, must be non-zero and less + than or equal to 8. Note that when \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_NONE, the first element of this array is ignored since + TMA doesn’t support the stride for dimension zero. + When all elements of the \p elementStrides array are one, \p boxDim specifies the number of elements to load. However, if \p elementStrides[i] + is not equal to one for some \p i, then TMA loads ceil( \p boxDim[i] / \p elementStrides[i]) number of elements along i-th dimension. + To load N elements along i-th dimension, \p boxDim[i] must be set to N * \p elementStrides[i]. 
+ + - \p interleave specifies the interleaved layout of type ::CUtensorMapInterleave, which is defined as: + \code +typedef enum CUtensorMapInterleave_enum { +CU_TENSOR_MAP_INTERLEAVE_NONE = 0, +CU_TENSOR_MAP_INTERLEAVE_16B, +CU_TENSOR_MAP_INTERLEAVE_32B +} CUtensorMapInterleave; + \endcode + TMA supports interleaved layouts like NC/8HWC8 where C8 utilizes 16 bytes in memory assuming 2 byte per channel or NC/16HWC16 where C16 + uses 32 bytes. + When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_NONE, the bounding box inner dimension (computed as \p channelsPerPixel multiplied by + element size in bytes derived from \p tensorDataType) must be less than or equal to the swizzle size. + - CU_TENSOR_MAP_SWIZZLE_64B requires the bounding box inner dimension to be <= 64. + - CU_TENSOR_MAP_SWIZZLE_128B* require the bounding box inner dimension to be <= 128. + Additionally, \p tensorDataType of ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B requires \p interleave to be ::CU_TENSOR_MAP_INTERLEAVE_NONE. + + - \p mode, which describes loading of elements loaded along the W dimension, has to be one of the following ::CUtensorMapIm2ColWideMode types: + \code + CU_TENSOR_MAP_IM2COL_WIDE_MODE_W, + CU_TENSOR_MAP_IM2COL_WIDE_MODE_W128 + \endcode + ::CU_TENSOR_MAP_IM2COL_WIDE_MODE_W allows the number of elements loaded along the W dimension to be specified + via the \p pixelsPerColumn field. + + - \p swizzle, which specifies the shared memory bank swizzling pattern, must be one of the following + ::CUtensorMapSwizzle modes (other swizzle modes are not supported): + \code +typedef enum CUtensorMapSwizzle_enum { +CU_TENSOR_MAP_SWIZZLE_64B, // Swizzle 16B chunks within 64B span +CU_TENSOR_MAP_SWIZZLE_128B, // Swizzle 16B chunks within 128B span +CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B, // Swizzle 32B chunks within 128B span +} CUtensorMapSwizzle; + \endcode + Data are organized in a specific order in global memory; however, this may not match the order in which the application accesses data + in shared memory. This difference in data organization may cause bank conflicts when shared memory is accessed. In order to avoid this + problem, data can be loaded to shared memory with shuffling across shared memory banks. + When the \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B, only the following swizzle modes are supported: + - CU_TENSOR_MAP_SWIZZLE_128B (Load & Store) + - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B (Load & Store) + When the \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, only the following swizzle modes are supported: + - CU_TENSOR_MAP_SWIZZLE_128B (Load only) + - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B (Load only) + + - \p l2Promotion specifies L2 fetch size which indicates the byte granularity at which L2 requests are filled from DRAM. 
It must be of + type ::CUtensorMapL2promotion, which is defined as: + \code +typedef enum CUtensorMapL2promotion_enum { +CU_TENSOR_MAP_L2_PROMOTION_NONE = 0, +CU_TENSOR_MAP_L2_PROMOTION_L2_64B, +CU_TENSOR_MAP_L2_PROMOTION_L2_128B, +CU_TENSOR_MAP_L2_PROMOTION_L2_256B +} CUtensorMapL2promotion; + \endcode + + - \p oobFill, which indicates whether zero or a special NaN constant should be used to fill out-of-bound elements, must be of type + ::CUtensorMapFloatOOBfill which is defined as: + \code +typedef enum CUtensorMapFloatOOBfill_enum { +CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE = 0, +CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA +} CUtensorMapFloatOOBfill; + \endcode + Note that ::CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA can only be used when \p tensorDataType represents a floating-point data type, + and when \p tensorDataType is not ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B, ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, and ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B. + + \param tensorMap - Tensor map object to create + \param tensorDataType - Tensor data type + \param tensorRank - Dimensionality of tensor; must be at least 3 + \param globalAddress - Starting address of memory region described by tensor + \param globalDim - Array containing tensor size (number of elements) along each of the \p tensorRank dimensions + \param globalStrides - Array containing stride size (in bytes) along each of the \p tensorRank - 1 dimensions + \param pixelBoxLowerCornerWidth - Width offset of left box corner + \param pixelBoxUpperCornerWidth - Width offset of right box corner + \param channelsPerPixel - Number of channels per pixel + \param pixelsPerColumn - Number of pixels per column + \param elementStrides - Array containing traversal stride in each of the \p tensorRank dimensions + \param interleave - Type of interleaved layout the tensor addresses + \param mode - W or W128 mode + \param swizzle - Bank swizzling pattern inside shared memory + \param l2Promotion - L2 promotion size + \param oobFill - Indicate whether zero or special NaN constant will be used to fill out-of-bound elements + + \return + ::CUDA_SUCCESS, + ::CUDA_ERROR_DEINITIALIZED, + ::CUDA_ERROR_NOT_INITIALIZED, + ::CUDA_ERROR_INVALID_CONTEXT, + ::CUDA_ERROR_INVALID_VALUE + + \sa + ::cuTensorMapEncodeTiled, + ::cuTensorMapEncodeIm2col, + ::cuTensorMapReplaceAddress*/ + fn cuTensorMapEncodeIm2colWide( + tensorMap: *mut cuda_types::cuda::CUtensorMap, + tensorDataType: cuda_types::cuda::CUtensorMapDataType, + tensorRank: cuda_types::cuda::cuuint32_t, + globalAddress: *mut ::core::ffi::c_void, + globalDim: *const cuda_types::cuda::cuuint64_t, + globalStrides: *const cuda_types::cuda::cuuint64_t, + pixelBoxLowerCornerWidth: ::core::ffi::c_int, + pixelBoxUpperCornerWidth: ::core::ffi::c_int, + channelsPerPixel: cuda_types::cuda::cuuint32_t, + pixelsPerColumn: cuda_types::cuda::cuuint32_t, + elementStrides: *const cuda_types::cuda::cuuint32_t, + interleave: cuda_types::cuda::CUtensorMapInterleave, + mode: cuda_types::cuda::CUtensorMapIm2ColWideMode, + swizzle: cuda_types::cuda::CUtensorMapSwizzle, + l2Promotion: cuda_types::cuda::CUtensorMapL2promotion, + oobFill: cuda_types::cuda::CUtensorMapFloatOOBfill, + ) -> cuda_types::cuda::CUresult; /** \brief Modify an existing tensor map descriptor with an updated global address Modifies the descriptor for Tensor Memory Access (TMA) object passed in \p tensorMap with @@ -17906,7 +19063,8 @@ CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA \sa ::cuTensorMapEncodeTiled, - ::cuTensorMapEncodeIm2col*/ + 
::cuTensorMapEncodeIm2col,
+   ::cuTensorMapEncodeIm2colWide*/
 fn cuTensorMapReplaceAddress(
 tensorMap: *mut cuda_types::cuda::CUtensorMap,
 globalAddress: *mut ::core::ffi::c_void,
 ) -> cuda_types::cuda::CUresult;
@@ -18399,10 +19557,12 @@ Returns ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED if direct peer access has
 CU_CTX_USER_COREDUMP_ENABLE flag was set during context creation.
 - ::CU_COREDUMP_TRIGGER_HOST: Bool where ::true means that the host CPU will
 also create a coredump. The default value is ::true unless set to ::false globally or
-   or locally.
+   locally. This value is deprecated as of CUDA 12.5 - raise the ::CU_COREDUMP_SKIP_ABORT
+   flag to disable host device abort() if needed.
 - ::CU_COREDUMP_LIGHTWEIGHT: Bool where ::true means that any resulting coredumps
 will not have a dump of GPU memory or non-reloc ELF images. The default value is
-   ::false unless set to ::true globally or locally.
+   ::false unless set to ::true globally or locally. This attribute is deprecated as
+   of CUDA 12.5; please use ::CU_COREDUMP_GENERATION_FLAGS instead.
 - ::CU_COREDUMP_ENABLE_USER_TRIGGER: Bool where ::true means that a coredump can be
 created by writing to the system pipe specified by ::CU_COREDUMP_PIPE. The default
 value is ::false unless set to ::true globally or locally.
@@ -18414,6 +19574,22 @@ Returns ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED if direct peer access has
 that will be monitored if user-triggered coredumps are enabled. The default value is
 ::corepipe.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine running
 the CUDA application and ::PID is the process ID of the CUDA application.
+   - ::CU_COREDUMP_GENERATION_FLAGS: An integer with values to allow granular control of the data
+   contained in a coredump, specified as a bitwise OR combination of the following values:
+   + ::CU_COREDUMP_DEFAULT_FLAGS - if set by itself, coredump generation returns to its
+   default settings of including all memory regions that it is able to access
+   + ::CU_COREDUMP_SKIP_NONRELOCATED_ELF_IMAGES - Coredump will not include the data from
+   CUDA source modules that are not relocated at runtime.
+   + ::CU_COREDUMP_SKIP_GLOBAL_MEMORY - Coredump will not include device-side global data
+   that does not belong to any context.
+   + ::CU_COREDUMP_SKIP_SHARED_MEMORY - Coredump will not include grid-scale shared memory
+   for the warp that the dumped kernel belonged to.
+   + ::CU_COREDUMP_SKIP_LOCAL_MEMORY - Coredump will not include local memory from the kernel.
+   + ::CU_COREDUMP_LIGHTWEIGHT_FLAGS - Enables all of the above options. Equivalent to setting
+   the ::CU_COREDUMP_LIGHTWEIGHT attribute to ::true.
+   + ::CU_COREDUMP_SKIP_ABORT - If set, GPU exceptions will not raise an abort() in the host CPU
+   process. Same functional goal as ::CU_COREDUMP_TRIGGER_HOST but better reflects the default
+   behavior.

 \param attrib - The enum defining which value to fetch.
 \param value - void* containing the requested data.
@@ -18450,10 +19626,13 @@ Returns ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED if direct peer access has
 this context will create a coredump at the location specified by ::CU_COREDUMP_FILE.
 The default value is ::false.
 - ::CU_COREDUMP_TRIGGER_HOST: Bool where ::true means that the host CPU will
-   also create a coredump. The default value is ::true.
+   also create a coredump. The default value is ::true unless set to ::false globally
+   or locally. This value is deprecated as of CUDA 12.5 - raise the ::CU_COREDUMP_SKIP_ABORT
+   flag to disable host device abort() if needed.
- ::CU_COREDUMP_LIGHTWEIGHT: Bool where ::true means that any resulting coredumps will not have a dump of GPU memory or non-reloc ELF images. The default value is - ::false. + ::false. This attribute is deprecated as of CUDA 12.5; please use ::CU_COREDUMP_GENERATION_FLAGS + instead. - ::CU_COREDUMP_ENABLE_USER_TRIGGER: Bool where ::true means that a coredump can be created by writing to the system pipe specified by ::CU_COREDUMP_PIPE. The default value is ::false. @@ -18465,6 +19644,22 @@ Returns ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED if direct peer access has that will be monitored if user-triggered coredumps are enabled. The default value is ::corepipe.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine running the CUDA application and ::PID is the process ID of the CUDA application. + - ::CU_COREDUMP_GENERATION_FLAGS: An integer with values to allow granular control of the data + contained in a coredump, specified as a bitwise OR combination of the following values: + + ::CU_COREDUMP_DEFAULT_FLAGS - If set by itself, coredump generation returns to its + default settings of including all memory regions that it is able to access. + + ::CU_COREDUMP_SKIP_NONRELOCATED_ELF_IMAGES - Coredump will not include the data from + CUDA source modules that are not relocated at runtime. + + ::CU_COREDUMP_SKIP_GLOBAL_MEMORY - Coredump will not include device-side global data + that does not belong to any context. + + ::CU_COREDUMP_SKIP_SHARED_MEMORY - Coredump will not include grid-scale shared memory + for the warp that the dumped kernel belonged to. + + ::CU_COREDUMP_SKIP_LOCAL_MEMORY - Coredump will not include local memory from the kernel. + + ::CU_COREDUMP_LIGHTWEIGHT_FLAGS - Enables all of the above options. Equivalent to setting + the ::CU_COREDUMP_LIGHTWEIGHT attribute to ::true. + + ::CU_COREDUMP_SKIP_ABORT - If set, GPU exceptions will not raise an abort() in the host CPU + process. Same functional goal as ::CU_COREDUMP_TRIGGER_HOST but better reflects the default + behavior. \param attrib - The enum defining which value to fetch. \param value - void* containing the requested data. @@ -18490,7 +19685,7 @@ Returns ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED if direct peer access has An important design decision to note is that any coredump environment variable values set before CUDA initializes will take permanent precedence over any values set with this - this function. This decision was made to ensure no change in behavior for any users that + function. This decision was made to ensure no change in behavior for any users that may be currently using these variables to get coredumps. \p *value shall contain the requested value specified by \p set. It is up to the caller @@ -18510,14 +19705,33 @@ Returns ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED if direct peer access has this context will create a coredump at the location specified by ::CU_COREDUMP_FILE. The default value is ::false. - ::CU_COREDUMP_TRIGGER_HOST: Bool where ::true means that the host CPU will - also create a coredump. The default value is ::true. + also create a coredump. The default value is ::true unless set to ::false globally or + locally. This value is deprecated as of CUDA 12.5 - raise the ::CU_COREDUMP_SKIP_ABORT + flag to disable host device abort() if needed. - ::CU_COREDUMP_LIGHTWEIGHT: Bool where ::true means that any resulting coredumps will not have a dump of GPU memory or non-reloc ELF images. The default value is - ::false. + ::false. 
This attribute is deprecated as of CUDA 12.5; please use ::CU_COREDUMP_GENERATION_FLAGS + instead. - ::CU_COREDUMP_FILE: String of up to 1023 characters that defines the location where any coredumps generated by this context will be written. The default value is ::core.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine running the CUDA applications and ::PID is the process ID of the CUDA application. + - ::CU_COREDUMP_GENERATION_FLAGS: An integer with values to allow granular control of the data + contained in a coredump, specified as a bitwise OR combination of the following values: + + ::CU_COREDUMP_DEFAULT_FLAGS - If set by itself, coredump generation returns to its + default settings of including all memory regions that it is able to access. + + ::CU_COREDUMP_SKIP_NONRELOCATED_ELF_IMAGES - Coredump will not include the data from + CUDA source modules that are not relocated at runtime. + + ::CU_COREDUMP_SKIP_GLOBAL_MEMORY - Coredump will not include device-side global data + that does not belong to any context. + + ::CU_COREDUMP_SKIP_SHARED_MEMORY - Coredump will not include grid-scale shared memory + for the warp that the dumped kernel belonged to. + + ::CU_COREDUMP_SKIP_LOCAL_MEMORY - Coredump will not include local memory from the kernel. + + ::CU_COREDUMP_LIGHTWEIGHT_FLAGS - Enables all of the above options. Equivalent to setting + the ::CU_COREDUMP_LIGHTWEIGHT attribute to ::true. + + ::CU_COREDUMP_SKIP_ABORT - If set, GPU exceptions will not raise an abort() in the host CPU + process. Same functional goal as ::CU_COREDUMP_TRIGGER_HOST but better reflects the default + behavior. \param attrib - The enum defining which value to set. \param value - void* containing the requested data. @@ -18549,7 +19763,7 @@ Returns ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED if direct peer access has An important design decision to note is that any coredump environment variable values set before CUDA initializes will take permanent precedence over any values set with this - this function. This decision was made to ensure no change in behavior for any users that + function. This decision was made to ensure no change in behavior for any users that may be currently using these variables to get coredumps. \p *value shall contain the requested value specified by \p set. It is up to the caller @@ -18563,10 +19777,13 @@ Returns ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED if direct peer access has this context will create a coredump at the location specified by ::CU_COREDUMP_FILE. The default value is ::false. - ::CU_COREDUMP_TRIGGER_HOST: Bool where ::true means that the host CPU will - also create a coredump. The default value is ::true. + also create a coredump. The default value is ::true unless set to ::false globally or + locally. This value is deprecated as of CUDA 12.5 - raise the ::CU_COREDUMP_SKIP_ABORT + flag to disable host device abort() if needed. - ::CU_COREDUMP_LIGHTWEIGHT: Bool where ::true means that any resulting coredumps will not have a dump of GPU memory or non-reloc ELF images. The default value is - ::false. + ::false. This attribute is deprecated as of CUDA 12.5; please use ::CU_COREDUMP_GENERATION_FLAGS + instead. - ::CU_COREDUMP_ENABLE_USER_TRIGGER: Bool where ::true means that a coredump can be created by writing to the system pipe specified by ::CU_COREDUMP_PIPE. The default value is ::false. @@ -18579,6 +19796,22 @@ Returns ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED if direct peer access has changed after ::CU_COREDUMP_ENABLE_USER_TRIGGER is set to ::true. 
The default value is ::corepipe.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine running the CUDA application and ::PID is the process ID of the CUDA application. + - ::CU_COREDUMP_GENERATION_FLAGS: An integer with values to allow granular control of the data + contained in a coredump, specified as a bitwise OR combination of the following values: + + ::CU_COREDUMP_DEFAULT_FLAGS - If set by itself, coredump generation returns to its + default settings of including all memory regions that it is able to access. + + ::CU_COREDUMP_SKIP_NONRELOCATED_ELF_IMAGES - Coredump will not include the data from + CUDA source modules that are not relocated at runtime. + + ::CU_COREDUMP_SKIP_GLOBAL_MEMORY - Coredump will not include device-side global data + that does not belong to any context. + + ::CU_COREDUMP_SKIP_SHARED_MEMORY - Coredump will not include grid-scale shared memory + for the warp that the dumped kernel belonged to. + + ::CU_COREDUMP_SKIP_LOCAL_MEMORY - Coredump will not include local memory from the kernel. + + ::CU_COREDUMP_LIGHTWEIGHT_FLAGS - Enables all of the above options. Equivalent to setting + the ::CU_COREDUMP_LIGHTWEIGHT attribute to ::true. + + ::CU_COREDUMP_SKIP_ABORT - If set, GPU exceptions will not raise an abort() in the host CPU + process. Same functional goal as ::CU_COREDUMP_TRIGGER_HOST but better reflects the default + behavior. \param attrib - The enum defining which value to set. \param value - void* containing the requested data. @@ -18785,18 +20018,24 @@ Returns ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED if direct peer access has first creating a descriptor and a green context with that descriptor. When creating the groups, the API will take into account the performance and functional characteristics of the - input resource, and guarantee a split that will create a disjoint set of symmetrical partitions. This may lead to less groups created + input resource, and guarantee a split that will create a disjoint set of symmetrical partitions. This may lead to fewer groups created than purely dividing the total SM count by the \p minCount due to cluster requirements or alignment and granularity requirements for the minCount. - The \p remainder set, might not have the same functional or performance guarantees as the groups in \p result. + The \p remainder set does not have the same functional or performance guarantees as the groups in \p result. Its use should be carefully planned and future partitions of the \p remainder set are discouraged. + The following flags are supported: + - \p CU_DEV_SM_RESOURCE_SPLIT_IGNORE_SM_COSCHEDULING : Lower the minimum SM count and alignment, and treat each SM independently of its hierarchy. + This allows more fine-grained partitions but at the cost of advanced features (such as large clusters on compute capability 9.0+). + - \p CU_DEV_SM_RESOURCE_SPLIT_MAX_POTENTIAL_CLUSTER_SIZE : Compute Capability 9.0+ only. Attempt to create groups that may allow + for maximally sized thread clusters. This can be queried after green context creation using ::cuOccupancyMaxPotentialClusterSize. + + A successful API call must either have: - - A valid array of \p result pointers of size passed in \p nbGroups, with \p Input of type \p CU_DEV_RESOURCE_TYPE_SM. - Value of \p minCount must be between 0 and the SM count specified in \p input. \p remaining and \p useFlags are optional. - - NULL passed in for \p result, with a valid integer pointer in \p nbGroups and \p Input of type \p CU_DEV_RESOURCE_TYPE_SM.
- Value of \p minCount must be between 0 and the SM count specified in \p input. + - A valid array of \p result pointers of size passed in \p nbGroups, with \p input of type \p CU_DEV_RESOURCE_TYPE_SM. + Value of \p minCount must be between 0 and the SM count specified in \p input. \p remaining may be NULL. + - NULL passed in for \p result, with a valid integer pointer in \p nbGroups and \p input of type \p CU_DEV_RESOURCE_TYPE_SM. + Value of \p minCount must be between 0 and the SM count specified in \p input. \p remaining may be NULL. + This queries the number of groups that would be created by the API. Note: The API is not supported on 32-bit platforms. @@ -18806,7 +20045,7 @@ Returns ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED if direct peer access has \param input - Input SM resource to be split. Must be a valid \p CU_DEV_RESOURCE_TYPE_SM resource. \param remaining - If the input resource cannot be cleanly split among \p nbGroups, the remaining is placed in here. Can be ommitted (NULL) if the user does not need the remaining set. - \param useFlags - Flags specifying how these partitions are used or which constraints to abide by when splitting the input. + \param useFlags - Flags specifying how these partitions are used or which constraints to abide by when splitting the input. Zero is valid for default behavior. \param minCount - Minimum number of SMs required \return @@ -18832,10 +20071,18 @@ Returns ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED if direct peer access has ) -> cuda_types::cuda::CUresult; /** \brief Generate a resource descriptor - Generates a resource descriptor with the set of resources specified in \p resources. + Generates a single resource descriptor with the set of resources specified in \p resources. The generated resource descriptor is necessary for the creation of green contexts via the ::cuGreenCtxCreate API. - The API expects \p nbResources == 1, as there is only one type of resource and merging the same - types of resource is currently not supported. + Resources of the same type can be passed in, provided they meet the requirements as noted below. + + A successful API call must have: + - A valid output pointer for the \p phDesc descriptor as well as a valid array of \p resources pointers, + with the array size passed in \p nbResources. + If multiple resources are provided in \p resources, the device they came from must be the same; + otherwise ::CUDA_ERROR_INVALID_RESOURCE_CONFIGURATION is returned. + If multiple resources are provided in \p resources and they are of type ::CU_DEV_RESOURCE_TYPE_SM, + they must be outputs (whether \p result or \p remaining) from the same split API instance; + otherwise ::CUDA_ERROR_INVALID_RESOURCE_CONFIGURATION is returned. Note: The API is not supported on 32-bit platforms. @@ -18860,15 +20107,16 @@ Returns ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED if direct peer access has ) -> cuda_types::cuda::CUresult; /** \brief Records an event. - Captures in \phEvent all the activities of the green context of \phCtx - at the time of this call. \phEvent and \phCtx must be from the same - CUDA context. Calls such as ::cuEventQuery() or ::cuGreenCtxWaitEvent() will + Captures in \p hEvent all the activities of the green context of \p hCtx + at the time of this call. \p hEvent and \p hCtx must be from the same + primary context, otherwise ::CUDA_ERROR_INVALID_HANDLE is returned. + Calls such as ::cuEventQuery() or ::cuGreenCtxWaitEvent() will then examine or wait for completion of the work that was captured. Uses of \p hCtx after this call do not modify \p hEvent. 
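+
+ As an illustrative sketch only (none of this is upstream documentation; hCtxA and
+ hCtxB are assumed to be valid green contexts on the same device, with ev created
+ in the corresponding primary context), the record/wait pair can be used like so:
+ \code
+CUevent ev;
+cuEventCreate(&ev, CU_EVENT_DISABLE_TIMING);
+/* Capture all work submitted to hCtxA so far */
+cuGreenCtxRecordEvent(hCtxA, ev);
+/* Order all future work submitted to hCtxB after the captured work */
+cuGreenCtxWaitEvent(hCtxB, ev);
+ \endcode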
- \note The API will return an error if the specified green context \p hCtx - has a stream in the capture mode. In such a case, the call will invalidate - all the conflicting captures. + \note The API will return ::CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED if the + specified green context \p hCtx has a stream in capture mode. In such + a case, the call will invalidate all the conflicting captures. \param hCtx - Green context to record event for \param hEvent - Event to record \return ::CUDA_SUCCESS, ::CUDA_ERROR_DEINITIALIZED, ::CUDA_ERROR_NOT_INITIALIZED, ::CUDA_ERROR_INVALID_CONTEXT, - ::CUDA_ERROR_INVALID_HANDLE + ::CUDA_ERROR_INVALID_HANDLE, + ::CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED \sa ::cuGreenCtxWaitEvent, - ::cuEventRecord*/ + ::cuEventRecord, + ::cuCtxRecordEvent, + ::cuCtxWaitEvent*/ fn cuGreenCtxRecordEvent( hCtx: cuda_types::cuda::CUgreenCtx, hEvent: cuda_types::cuda::CUevent, ) -> cuda_types::cuda::CUresult; /** \brief Make a green context wait on an event - Makes all future work submitted to green context \phCtx wait for all work - captured in \phEvent. The synchronization will be performed on the device + Makes all future work submitted to green context \p hCtx wait for all work + captured in \p hEvent. The synchronization will be performed on the device and will not block the calling CPU thread. See ::cuGreenCtxRecordEvent() - for details on what is captured by an event. + or ::cuEventRecord() for details on what is captured by an event. - \note The API will return an error and invalidate the capture if the specified - event \p hEvent is part of an ongoing capture sequence. + \note \p hEvent may be from a different context or device than \p hCtx. + + \note The API will return ::CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED and + invalidate the capture if the specified event \p hEvent is part of an + ongoing capture sequence or if the specified green context \p hCtx has + a stream in capture mode. \param hCtx - Green context to wait \param hEvent - Event to wait on \return ::CUDA_SUCCESS, ::CUDA_ERROR_DEINITIALIZED, ::CUDA_ERROR_NOT_INITIALIZED, ::CUDA_ERROR_INVALID_CONTEXT, - ::CUDA_ERROR_INVALID_HANDLE + ::CUDA_ERROR_INVALID_HANDLE, + ::CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED \sa ::cuGreenCtxRecordEvent, - ::cuStreamWaitEvent*/ + ::cuStreamWaitEvent, + ::cuCtxRecordEvent, + ::cuCtxWaitEvent*/ fn cuGreenCtxWaitEvent( hCtx: cuda_types::cuda::CUgreenCtx, hEvent: cuda_types::cuda::CUevent, ) -> cuda_types::cuda::CUresult; @@ -18922,7 +20180,9 @@ Returns ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED if direct peer access has The stream handle \p hStream can refer to any of the following:
• - a stream created via any of the CUDA driver APIs such as ::cuStreamCreate. + a stream created via any of the CUDA driver APIs such as ::cuStreamCreate, ::cuStreamCreateWithPriority + and ::cuGreenCtxStreamCreate, or their runtime API equivalents such as + ::cudaStreamCreate, ::cudaStreamCreateWithFlags, and ::cudaStreamCreateWithPriority. If during stream creation the context that was active in the calling thread was obtained with cuCtxFromGreenCtx, that green context is returned in \p phCtx. Otherwise, \p *phCtx is set to NULL instead. @@ -18948,9 +20208,13 @@ Returns ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED if direct peer access has \notefnerr \sa ::cuStreamDestroy, + ::cuStreamCreate, ::cuStreamCreateWithPriority, + ::cuStreamGetCtx_v2, + ::cuGreenCtxStreamCreate, ::cuStreamGetPriority, ::cuStreamGetFlags, + ::cuStreamGetDevice, ::cuStreamWaitEvent, ::cuStreamQuery, ::cuStreamSynchronize, @@ -18961,6 +20225,64 @@ Returns ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED if direct peer access has hStream: cuda_types::cuda::CUstream, phCtx: *mut cuda_types::cuda::CUgreenCtx, ) -> cuda_types::cuda::CUresult; + /** \brief Create a stream for use in the green context + + Creates a stream for use in the specified green context \p greenCtx and returns a handle in \p phStream. + The stream can be destroyed by calling ::cuStreamDestroy(). Note that the API ignores the context that + is current to the calling thread and creates a stream in the specified green context \p greenCtx. + + The supported values for \p flags are: + - ::CU_STREAM_NON_BLOCKING: This must be specified. It indicates that work running in the created + stream may run concurrently with work in the default stream, and that + the created stream should perform no implicit synchronization with the default stream. + + Specifying \p priority affects the scheduling priority of work in the stream. Priorities provide a + hint to preferentially run work with higher priority when possible, but do not preempt + already-running work or provide any other functional guarantee on execution order. + \p priority follows a convention where lower numbers represent higher priorities. + '0' represents default priority. The range of meaningful numerical priorities can + be queried using ::cuCtxGetStreamPriorityRange. If the specified priority is + outside the numerical range returned by ::cuCtxGetStreamPriorityRange, + it will automatically be clamped to the lowest or the highest number in the range. + + \param phStream - Returned newly created stream + \param greenCtx - Green context for which to create the stream + \param flags - Flags for stream creation. \p CU_STREAM_NON_BLOCKING must be specified. + \param priority - Stream priority. Lower numbers represent higher priorities. + See ::cuCtxGetStreamPriorityRange for more information about + meaningful stream priorities that can be passed. + + \return + ::CUDA_SUCCESS, + ::CUDA_ERROR_DEINITIALIZED, + ::CUDA_ERROR_NOT_INITIALIZED, + ::CUDA_ERROR_INVALID_CONTEXT, + ::CUDA_ERROR_INVALID_VALUE, + ::CUDA_ERROR_OUT_OF_MEMORY + \notefnerr + + \note In the current implementation, only compute kernels launched in + priority streams are affected by the stream's priority. Stream priorities have + no effect on host-to-device and device-to-host memory operations. 
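+
+ A minimal usage sketch (an editorial illustration, not upstream documentation;
+ greenCtx is assumed to be a valid ::CUgreenCtx obtained from ::cuGreenCtxCreate):
+ \code
+CUstream stream;
+/* CU_STREAM_NON_BLOCKING is mandatory; priority 0 requests the default */
+if (cuGreenCtxStreamCreate(&stream, greenCtx, CU_STREAM_NON_BLOCKING, 0) == CUDA_SUCCESS) {
+    /* ... submit work to the green context through the stream ... */
+    cuStreamDestroy(stream);
+}
+ \endcode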
+ + \sa ::cuStreamDestroy, + ::cuGreenCtxCreate, + ::cuStreamCreate, + ::cuStreamGetPriority, + ::cuCtxGetStreamPriorityRange, + ::cuStreamGetFlags, + ::cuStreamGetDevice, + ::cuStreamWaitEvent, + ::cuStreamQuery, + ::cuStreamSynchronize, + ::cuStreamAddCallback, + ::cudaStreamCreateWithPriority*/ + fn cuGreenCtxStreamCreate( + phStream: *mut cuda_types::cuda::CUstream, + greenCtx: cuda_types::cuda::CUgreenCtx, + flags: ::core::ffi::c_uint, + priority: ::core::ffi::c_int, + ) -> cuda_types::cuda::CUresult; fn cuMemHostRegister( p: *mut ::core::ffi::c_void, bytesize: usize, @@ -19396,6 +20718,24 @@ Returns ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED if direct peer access has pCopy: *const cuda_types::cuda::CUDA_MEMCPY3D_PEER, hStream: cuda_types::cuda::CUstream, ) -> cuda_types::cuda::CUresult; + fn cuMemcpyBatchAsync( + dsts: *mut cuda_types::cuda::CUdeviceptr, + srcs: *mut cuda_types::cuda::CUdeviceptr, + sizes: *mut usize, + count: usize, + attrs: *mut cuda_types::cuda::CUmemcpyAttributes, + attrsIdxs: *mut usize, + numAttrs: usize, + failIdx: *mut usize, + hStream: cuda_types::cuda::CUstream, + ) -> cuda_types::cuda::CUresult; + fn cuMemcpy3DBatchAsync( + numOps: usize, + opList: *mut cuda_types::cuda::CUDA_MEMCPY3D_BATCH_OP, + failIdx: *mut usize, + flags: ::core::ffi::c_ulonglong, + hStream: cuda_types::cuda::CUstream, + ) -> cuda_types::cuda::CUresult; fn cuMemsetD8Async( dstDevice: cuda_types::cuda::CUdeviceptr, uc: ::core::ffi::c_uchar, @@ -19450,10 +20790,19 @@ Returns ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED if direct peer access has hStream: cuda_types::cuda::CUstream, flags: *mut ::core::ffi::c_uint, ) -> cuda_types::cuda::CUresult; + fn cuStreamGetDevice( + hStream: cuda_types::cuda::CUstream, + device: *mut cuda_types::cuda::CUdevice, + ) -> cuda_types::cuda::CUresult; fn cuStreamGetCtx( hStream: cuda_types::cuda::CUstream, pctx: *mut cuda_types::cuda::CUcontext, ) -> cuda_types::cuda::CUresult; + fn cuStreamGetCtx_v2( + hStream: cuda_types::cuda::CUstream, + pCtx: *mut cuda_types::cuda::CUcontext, + pGreenCtx: *mut cuda_types::cuda::CUgreenCtx, + ) -> cuda_types::cuda::CUresult; fn cuStreamWaitEvent( hStream: cuda_types::cuda::CUstream, hEvent: cuda_types::cuda::CUevent, @@ -19803,12 +21152,144 @@ Returns ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED if direct peer access has numDependencies: usize, flags: ::core::ffi::c_uint, ) -> cuda_types::cuda::CUresult; + fn cuMemBatchDecompressAsync( + paramsArray: *mut cuda_types::cuda::CUmemDecompressParams, + count: usize, + flags: ::core::ffi::c_uint, + errorIndex: *mut usize, + stream: cuda_types::cuda::CUstream, + ) -> cuda_types::cuda::CUresult; fn cuGetProcAddress( symbol: *const ::core::ffi::c_char, pfn: *mut *mut ::core::ffi::c_void, cudaVersion: ::core::ffi::c_int, flags: cuda_types::cuda::cuuint64_t, ) -> cuda_types::cuda::CUresult; + /** \brief Returns the restore thread ID for a CUDA process + + Returns in \p *tid the thread ID of the CUDA restore thread for the process + specified by \p pid. + + \param pid - The process ID of the CUDA process + \param tid - Returned restore thread ID + + \return + ::CUDA_SUCCESS + ::CUDA_ERROR_INVALID_VALUE + ::CUDA_ERROR_NOT_INITIALIZED + ::CUDA_ERROR_NOT_SUPPORTED*/ + fn cuCheckpointProcessGetRestoreThreadId( + pid: ::core::ffi::c_int, + tid: *mut ::core::ffi::c_int, + ) -> cuda_types::cuda::CUresult; + /** \brief Returns the process state of a CUDA process + + Returns in \p *state the current state of the CUDA process specified by \p pid. 
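+
+ For illustration only (not upstream documentation; pid is assumed to name a
+ running CUDA process, and the RUNNING state is assumed to be exposed as
+ ::CU_PROCESS_STATE_RUNNING), a state query ahead of a lock attempt might look like:
+ \code
+CUprocessState state;
+if (cuCheckpointProcessGetState(pid, &state) == CUDA_SUCCESS
+    && state == CU_PROCESS_STATE_RUNNING) {
+    /* The process is running and may now be locked with
+       cuCheckpointProcessLock() before checkpointing. */
+}
+ \endcode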
+ + \param pid - The process ID of the CUDA process + \param state - Returned CUDA process state + + \return + ::CUDA_SUCCESS + ::CUDA_ERROR_INVALID_VALUE + ::CUDA_ERROR_NOT_INITIALIZED + ::CUDA_ERROR_NOT_SUPPORTED*/ + fn cuCheckpointProcessGetState( + pid: ::core::ffi::c_int, + state: *mut cuda_types::cuda::CUprocessState, + ) -> cuda_types::cuda::CUresult; + /** \brief Lock a running CUDA process + + Locks the CUDA process specified by \p pid, which will block further CUDA API + calls. Process must be in the RUNNING state in order to lock. + + Upon successful return the process will be in the LOCKED state. + + If timeoutMs is specified and the timeout is reached, the process will be left + in the RUNNING state upon return. + + \param pid - The process ID of the CUDA process + \param args - Optional lock operation arguments + + \return + ::CUDA_SUCCESS + ::CUDA_ERROR_INVALID_VALUE + ::CUDA_ERROR_NOT_INITIALIZED + ::CUDA_ERROR_ILLEGAL_STATE + ::CUDA_ERROR_NOT_SUPPORTED + ::CUDA_ERROR_NOT_READY*/ + fn cuCheckpointProcessLock( + pid: ::core::ffi::c_int, + args: *mut cuda_types::cuda::CUcheckpointLockArgs, + ) -> cuda_types::cuda::CUresult; + /** \brief Checkpoint a CUDA process's GPU memory contents + + Checkpoints a CUDA process specified by \p pid that is in the LOCKED + state. The GPU memory contents will be brought into host memory and all + underlying references will be released. Process must be in the LOCKED state + to checkpoint. + + Upon successful return the process will be in the CHECKPOINTED state. + + \param pid - The process ID of the CUDA process + \param args - Optional checkpoint operation arguments + + \return + ::CUDA_SUCCESS + ::CUDA_ERROR_INVALID_VALUE + ::CUDA_ERROR_NOT_INITIALIZED + ::CUDA_ERROR_ILLEGAL_STATE + ::CUDA_ERROR_NOT_SUPPORTED*/ + fn cuCheckpointProcessCheckpoint( + pid: ::core::ffi::c_int, + args: *mut cuda_types::cuda::CUcheckpointCheckpointArgs, + ) -> cuda_types::cuda::CUresult; + /** \brief Restore a CUDA process's GPU memory contents from its last checkpoint + + Restores a CUDA process specified by \p pid from its last checkpoint. Process + must be in the CHECKPOINTED state to restore. + + Upon successful return the process will be in the LOCKED state. + + CUDA process restore requires persistence mode to be enabled or ::cuInit to + have been called before execution. + + \param pid - The process ID of the CUDA process + \param args - Optional restore operation arguments + + \return + ::CUDA_SUCCESS + ::CUDA_ERROR_INVALID_VALUE + ::CUDA_ERROR_NOT_INITIALIZED + ::CUDA_ERROR_ILLEGAL_STATE + ::CUDA_ERROR_NOT_SUPPORTED + + \sa + ::cuInit*/ + fn cuCheckpointProcessRestore( + pid: ::core::ffi::c_int, + args: *mut cuda_types::cuda::CUcheckpointRestoreArgs, + ) -> cuda_types::cuda::CUresult; + /** \brief Unlock a CUDA process to allow CUDA API calls + + Unlocks a process specified by \p pid, allowing it to resume making CUDA API + calls. Process must be in the LOCKED state. + + Upon successful return the process will be in the RUNNING state. + + \param pid - The process ID of the CUDA process + \param args - Optional unlock operation arguments + + \return + ::CUDA_SUCCESS + ::CUDA_ERROR_INVALID_VALUE + ::CUDA_ERROR_NOT_INITIALIZED + ::CUDA_ERROR_ILLEGAL_STATE + ::CUDA_ERROR_NOT_SUPPORTED*/ + fn cuCheckpointProcessUnlock( + pid: ::core::ffi::c_int, + args: *mut cuda_types::cuda::CUcheckpointUnlockArgs, + ) -> cuda_types::cuda::CUresult; /** \brief Initialize the profiling. 
\deprecated diff --git a/cuda_base/src/cudnn8.rs b/cuda_base/src/cudnn8.rs new file mode 100644 index 0000000..ff16b5a --- /dev/null +++ b/cuda_base/src/cudnn8.rs @@ -0,0 +1,2579 @@ +// Generated automatically by zluda_bindgen +// DO NOT EDIT MANUALLY +#![allow(warnings)] +extern "system" { + fn cudnnGetVersion() -> usize; + fn cudnnGetMaxDeviceVersion() -> usize; + fn cudnnGetCudartVersion() -> usize; + fn cudnnGetErrorString( + status: cuda_types::cudnn8::cudnnStatus_t, + ) -> *const ::core::ffi::c_char; + #[must_use] + fn cudnnQueryRuntimeError( + handle: cuda_types::cudnn8::cudnnHandle_t, + rstatus: *mut cuda_types::cudnn8::cudnnStatus_t, + mode: cuda_types::cudnn8::cudnnErrQueryMode_t, + tag: *mut cuda_types::cudnn8::cudnnRuntimeTag_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetProperty( + type_: cuda_types::cudnn8::libraryPropertyType, + value: *mut ::core::ffi::c_int, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnCreate( + handle: *mut cuda_types::cudnn8::cudnnHandle_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnDestroy( + handle: cuda_types::cudnn8::cudnnHandle_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnSetStream( + handle: cuda_types::cudnn8::cudnnHandle_t, + streamId: cuda_types::cudnn8::cudaStream_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetStream( + handle: cuda_types::cudnn8::cudnnHandle_t, + streamId: *mut cuda_types::cudnn8::cudaStream_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnCreateTensorDescriptor( + tensorDesc: *mut cuda_types::cudnn8::cudnnTensorDescriptor_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnSetTensor4dDescriptor( + tensorDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + format: cuda_types::cudnn8::cudnnTensorFormat_t, + dataType: cuda_types::cudnn8::cudnnDataType_t, + n: ::core::ffi::c_int, + c: ::core::ffi::c_int, + h: ::core::ffi::c_int, + w: ::core::ffi::c_int, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnSetTensor4dDescriptorEx( + tensorDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + dataType: cuda_types::cudnn8::cudnnDataType_t, + n: ::core::ffi::c_int, + c: ::core::ffi::c_int, + h: ::core::ffi::c_int, + w: ::core::ffi::c_int, + nStride: ::core::ffi::c_int, + cStride: ::core::ffi::c_int, + hStride: ::core::ffi::c_int, + wStride: ::core::ffi::c_int, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetTensor4dDescriptor( + tensorDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + dataType: *mut cuda_types::cudnn8::cudnnDataType_t, + n: *mut ::core::ffi::c_int, + c: *mut ::core::ffi::c_int, + h: *mut ::core::ffi::c_int, + w: *mut ::core::ffi::c_int, + nStride: *mut ::core::ffi::c_int, + cStride: *mut ::core::ffi::c_int, + hStride: *mut ::core::ffi::c_int, + wStride: *mut ::core::ffi::c_int, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnSetTensorNdDescriptor( + tensorDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + dataType: cuda_types::cudnn8::cudnnDataType_t, + nbDims: ::core::ffi::c_int, + dimA: *const ::core::ffi::c_int, + strideA: *const ::core::ffi::c_int, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnSetTensorNdDescriptorEx( + tensorDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + format: cuda_types::cudnn8::cudnnTensorFormat_t, + dataType: cuda_types::cudnn8::cudnnDataType_t, + nbDims: ::core::ffi::c_int, + dimA: *const ::core::ffi::c_int, + ) -> cuda_types::cudnn8::cudnnStatus_t; + 
#[must_use] + fn cudnnGetTensorNdDescriptor( + tensorDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + nbDimsRequested: ::core::ffi::c_int, + dataType: *mut cuda_types::cudnn8::cudnnDataType_t, + nbDims: *mut ::core::ffi::c_int, + dimA: *mut ::core::ffi::c_int, + strideA: *mut ::core::ffi::c_int, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetTensorSizeInBytes( + tensorDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + size: *mut usize, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnDestroyTensorDescriptor( + tensorDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnInitTransformDest( + transformDesc: cuda_types::cudnn8::cudnnTensorTransformDescriptor_t, + srcDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + destDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + destSizeInBytes: *mut usize, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnCreateTensorTransformDescriptor( + transformDesc: *mut cuda_types::cudnn8::cudnnTensorTransformDescriptor_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnSetTensorTransformDescriptor( + transformDesc: cuda_types::cudnn8::cudnnTensorTransformDescriptor_t, + nbDims: u32, + destFormat: cuda_types::cudnn8::cudnnTensorFormat_t, + padBeforeA: *const i32, + padAfterA: *const i32, + foldA: *const u32, + direction: cuda_types::cudnn8::cudnnFoldingDirection_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetTensorTransformDescriptor( + transformDesc: cuda_types::cudnn8::cudnnTensorTransformDescriptor_t, + nbDimsRequested: u32, + destFormat: *mut cuda_types::cudnn8::cudnnTensorFormat_t, + padBeforeA: *mut i32, + padAfterA: *mut i32, + foldA: *mut u32, + direction: *mut cuda_types::cudnn8::cudnnFoldingDirection_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnDestroyTensorTransformDescriptor( + transformDesc: cuda_types::cudnn8::cudnnTensorTransformDescriptor_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnTransformTensor( + handle: cuda_types::cudnn8::cudnnHandle_t, + alpha: *const ::core::ffi::c_void, + xDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + x: *const ::core::ffi::c_void, + beta: *const ::core::ffi::c_void, + yDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + y: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnTransformTensorEx( + handle: cuda_types::cudnn8::cudnnHandle_t, + transDesc: cuda_types::cudnn8::cudnnTensorTransformDescriptor_t, + alpha: *const ::core::ffi::c_void, + srcDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + srcData: *const ::core::ffi::c_void, + beta: *const ::core::ffi::c_void, + destDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + destData: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnAddTensor( + handle: cuda_types::cudnn8::cudnnHandle_t, + alpha: *const ::core::ffi::c_void, + aDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + A: *const ::core::ffi::c_void, + beta: *const ::core::ffi::c_void, + cDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + C: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnCreateOpTensorDescriptor( + opTensorDesc: *mut cuda_types::cudnn8::cudnnOpTensorDescriptor_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnSetOpTensorDescriptor( + opTensorDesc: cuda_types::cudnn8::cudnnOpTensorDescriptor_t, + opTensorOp: 
cuda_types::cudnn8::cudnnOpTensorOp_t, + opTensorCompType: cuda_types::cudnn8::cudnnDataType_t, + opTensorNanOpt: cuda_types::cudnn8::cudnnNanPropagation_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetOpTensorDescriptor( + opTensorDesc: cuda_types::cudnn8::cudnnOpTensorDescriptor_t, + opTensorOp: *mut cuda_types::cudnn8::cudnnOpTensorOp_t, + opTensorCompType: *mut cuda_types::cudnn8::cudnnDataType_t, + opTensorNanOpt: *mut cuda_types::cudnn8::cudnnNanPropagation_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnDestroyOpTensorDescriptor( + opTensorDesc: cuda_types::cudnn8::cudnnOpTensorDescriptor_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnOpTensor( + handle: cuda_types::cudnn8::cudnnHandle_t, + opTensorDesc: cuda_types::cudnn8::cudnnOpTensorDescriptor_t, + alpha1: *const ::core::ffi::c_void, + aDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + A: *const ::core::ffi::c_void, + alpha2: *const ::core::ffi::c_void, + bDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + B: *const ::core::ffi::c_void, + beta: *const ::core::ffi::c_void, + cDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + C: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnCreateReduceTensorDescriptor( + reduceTensorDesc: *mut cuda_types::cudnn8::cudnnReduceTensorDescriptor_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnSetReduceTensorDescriptor( + reduceTensorDesc: cuda_types::cudnn8::cudnnReduceTensorDescriptor_t, + reduceTensorOp: cuda_types::cudnn8::cudnnReduceTensorOp_t, + reduceTensorCompType: cuda_types::cudnn8::cudnnDataType_t, + reduceTensorNanOpt: cuda_types::cudnn8::cudnnNanPropagation_t, + reduceTensorIndices: cuda_types::cudnn8::cudnnReduceTensorIndices_t, + reduceTensorIndicesType: cuda_types::cudnn8::cudnnIndicesType_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetReduceTensorDescriptor( + reduceTensorDesc: cuda_types::cudnn8::cudnnReduceTensorDescriptor_t, + reduceTensorOp: *mut cuda_types::cudnn8::cudnnReduceTensorOp_t, + reduceTensorCompType: *mut cuda_types::cudnn8::cudnnDataType_t, + reduceTensorNanOpt: *mut cuda_types::cudnn8::cudnnNanPropagation_t, + reduceTensorIndices: *mut cuda_types::cudnn8::cudnnReduceTensorIndices_t, + reduceTensorIndicesType: *mut cuda_types::cudnn8::cudnnIndicesType_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnDestroyReduceTensorDescriptor( + reduceTensorDesc: cuda_types::cudnn8::cudnnReduceTensorDescriptor_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetReductionIndicesSize( + handle: cuda_types::cudnn8::cudnnHandle_t, + reduceTensorDesc: cuda_types::cudnn8::cudnnReduceTensorDescriptor_t, + aDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + cDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + sizeInBytes: *mut usize, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetReductionWorkspaceSize( + handle: cuda_types::cudnn8::cudnnHandle_t, + reduceTensorDesc: cuda_types::cudnn8::cudnnReduceTensorDescriptor_t, + aDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + cDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + sizeInBytes: *mut usize, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnReduceTensor( + handle: cuda_types::cudnn8::cudnnHandle_t, + reduceTensorDesc: cuda_types::cudnn8::cudnnReduceTensorDescriptor_t, + indices: *mut ::core::ffi::c_void, + indicesSizeInBytes: usize, + workspace: *mut ::core::ffi::c_void, + 
workspaceSizeInBytes: usize, + alpha: *const ::core::ffi::c_void, + aDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + A: *const ::core::ffi::c_void, + beta: *const ::core::ffi::c_void, + cDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + C: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnSetTensor( + handle: cuda_types::cudnn8::cudnnHandle_t, + yDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + y: *mut ::core::ffi::c_void, + valuePtr: *const ::core::ffi::c_void, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnScaleTensor( + handle: cuda_types::cudnn8::cudnnHandle_t, + yDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + y: *mut ::core::ffi::c_void, + alpha: *const ::core::ffi::c_void, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnCreateFilterDescriptor( + filterDesc: *mut cuda_types::cudnn8::cudnnFilterDescriptor_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnSetFilter4dDescriptor( + filterDesc: cuda_types::cudnn8::cudnnFilterDescriptor_t, + dataType: cuda_types::cudnn8::cudnnDataType_t, + format: cuda_types::cudnn8::cudnnTensorFormat_t, + k: ::core::ffi::c_int, + c: ::core::ffi::c_int, + h: ::core::ffi::c_int, + w: ::core::ffi::c_int, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetFilter4dDescriptor( + filterDesc: cuda_types::cudnn8::cudnnFilterDescriptor_t, + dataType: *mut cuda_types::cudnn8::cudnnDataType_t, + format: *mut cuda_types::cudnn8::cudnnTensorFormat_t, + k: *mut ::core::ffi::c_int, + c: *mut ::core::ffi::c_int, + h: *mut ::core::ffi::c_int, + w: *mut ::core::ffi::c_int, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnSetFilterNdDescriptor( + filterDesc: cuda_types::cudnn8::cudnnFilterDescriptor_t, + dataType: cuda_types::cudnn8::cudnnDataType_t, + format: cuda_types::cudnn8::cudnnTensorFormat_t, + nbDims: ::core::ffi::c_int, + filterDimA: *const ::core::ffi::c_int, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetFilterNdDescriptor( + filterDesc: cuda_types::cudnn8::cudnnFilterDescriptor_t, + nbDimsRequested: ::core::ffi::c_int, + dataType: *mut cuda_types::cudnn8::cudnnDataType_t, + format: *mut cuda_types::cudnn8::cudnnTensorFormat_t, + nbDims: *mut ::core::ffi::c_int, + filterDimA: *mut ::core::ffi::c_int, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetFilterSizeInBytes( + filterDesc: cuda_types::cudnn8::cudnnFilterDescriptor_t, + size: *mut usize, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnTransformFilter( + handle: cuda_types::cudnn8::cudnnHandle_t, + transDesc: cuda_types::cudnn8::cudnnTensorTransformDescriptor_t, + alpha: *const ::core::ffi::c_void, + srcDesc: cuda_types::cudnn8::cudnnFilterDescriptor_t, + srcData: *const ::core::ffi::c_void, + beta: *const ::core::ffi::c_void, + destDesc: cuda_types::cudnn8::cudnnFilterDescriptor_t, + destData: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnDestroyFilterDescriptor( + filterDesc: cuda_types::cudnn8::cudnnFilterDescriptor_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnSoftmaxForward( + handle: cuda_types::cudnn8::cudnnHandle_t, + algo: cuda_types::cudnn8::cudnnSoftmaxAlgorithm_t, + mode: cuda_types::cudnn8::cudnnSoftmaxMode_t, + alpha: *const ::core::ffi::c_void, + xDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + x: *const ::core::ffi::c_void, + beta: *const ::core::ffi::c_void, + yDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, 
+ y: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnCreatePoolingDescriptor( + poolingDesc: *mut cuda_types::cudnn8::cudnnPoolingDescriptor_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnSetPooling2dDescriptor( + poolingDesc: cuda_types::cudnn8::cudnnPoolingDescriptor_t, + mode: cuda_types::cudnn8::cudnnPoolingMode_t, + maxpoolingNanOpt: cuda_types::cudnn8::cudnnNanPropagation_t, + windowHeight: ::core::ffi::c_int, + windowWidth: ::core::ffi::c_int, + verticalPadding: ::core::ffi::c_int, + horizontalPadding: ::core::ffi::c_int, + verticalStride: ::core::ffi::c_int, + horizontalStride: ::core::ffi::c_int, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetPooling2dDescriptor( + poolingDesc: cuda_types::cudnn8::cudnnPoolingDescriptor_t, + mode: *mut cuda_types::cudnn8::cudnnPoolingMode_t, + maxpoolingNanOpt: *mut cuda_types::cudnn8::cudnnNanPropagation_t, + windowHeight: *mut ::core::ffi::c_int, + windowWidth: *mut ::core::ffi::c_int, + verticalPadding: *mut ::core::ffi::c_int, + horizontalPadding: *mut ::core::ffi::c_int, + verticalStride: *mut ::core::ffi::c_int, + horizontalStride: *mut ::core::ffi::c_int, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnSetPoolingNdDescriptor( + poolingDesc: cuda_types::cudnn8::cudnnPoolingDescriptor_t, + mode: cuda_types::cudnn8::cudnnPoolingMode_t, + maxpoolingNanOpt: cuda_types::cudnn8::cudnnNanPropagation_t, + nbDims: ::core::ffi::c_int, + windowDimA: *const ::core::ffi::c_int, + paddingA: *const ::core::ffi::c_int, + strideA: *const ::core::ffi::c_int, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetPoolingNdDescriptor( + poolingDesc: cuda_types::cudnn8::cudnnPoolingDescriptor_t, + nbDimsRequested: ::core::ffi::c_int, + mode: *mut cuda_types::cudnn8::cudnnPoolingMode_t, + maxpoolingNanOpt: *mut cuda_types::cudnn8::cudnnNanPropagation_t, + nbDims: *mut ::core::ffi::c_int, + windowDimA: *mut ::core::ffi::c_int, + paddingA: *mut ::core::ffi::c_int, + strideA: *mut ::core::ffi::c_int, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetPoolingNdForwardOutputDim( + poolingDesc: cuda_types::cudnn8::cudnnPoolingDescriptor_t, + inputTensorDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + nbDims: ::core::ffi::c_int, + outputTensorDimA: *mut ::core::ffi::c_int, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetPooling2dForwardOutputDim( + poolingDesc: cuda_types::cudnn8::cudnnPoolingDescriptor_t, + inputTensorDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + n: *mut ::core::ffi::c_int, + c: *mut ::core::ffi::c_int, + h: *mut ::core::ffi::c_int, + w: *mut ::core::ffi::c_int, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnDestroyPoolingDescriptor( + poolingDesc: cuda_types::cudnn8::cudnnPoolingDescriptor_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnPoolingForward( + handle: cuda_types::cudnn8::cudnnHandle_t, + poolingDesc: cuda_types::cudnn8::cudnnPoolingDescriptor_t, + alpha: *const ::core::ffi::c_void, + xDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + x: *const ::core::ffi::c_void, + beta: *const ::core::ffi::c_void, + yDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + y: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnCreateActivationDescriptor( + activationDesc: *mut cuda_types::cudnn8::cudnnActivationDescriptor_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn 
cudnnSetActivationDescriptor( + activationDesc: cuda_types::cudnn8::cudnnActivationDescriptor_t, + mode: cuda_types::cudnn8::cudnnActivationMode_t, + reluNanOpt: cuda_types::cudnn8::cudnnNanPropagation_t, + coef: f64, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetActivationDescriptor( + activationDesc: cuda_types::cudnn8::cudnnActivationDescriptor_t, + mode: *mut cuda_types::cudnn8::cudnnActivationMode_t, + reluNanOpt: *mut cuda_types::cudnn8::cudnnNanPropagation_t, + coef: *mut f64, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnSetActivationDescriptorSwishBeta( + activationDesc: cuda_types::cudnn8::cudnnActivationDescriptor_t, + swish_beta: f64, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetActivationDescriptorSwishBeta( + activationDesc: cuda_types::cudnn8::cudnnActivationDescriptor_t, + swish_beta: *mut f64, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnDestroyActivationDescriptor( + activationDesc: cuda_types::cudnn8::cudnnActivationDescriptor_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnActivationForward( + handle: cuda_types::cudnn8::cudnnHandle_t, + activationDesc: cuda_types::cudnn8::cudnnActivationDescriptor_t, + alpha: *const ::core::ffi::c_void, + xDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + x: *const ::core::ffi::c_void, + beta: *const ::core::ffi::c_void, + yDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + y: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnCreateLRNDescriptor( + normDesc: *mut cuda_types::cudnn8::cudnnLRNDescriptor_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnSetLRNDescriptor( + normDesc: cuda_types::cudnn8::cudnnLRNDescriptor_t, + lrnN: ::core::ffi::c_uint, + lrnAlpha: f64, + lrnBeta: f64, + lrnK: f64, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetLRNDescriptor( + normDesc: cuda_types::cudnn8::cudnnLRNDescriptor_t, + lrnN: *mut ::core::ffi::c_uint, + lrnAlpha: *mut f64, + lrnBeta: *mut f64, + lrnK: *mut f64, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnDestroyLRNDescriptor( + lrnDesc: cuda_types::cudnn8::cudnnLRNDescriptor_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnLRNCrossChannelForward( + handle: cuda_types::cudnn8::cudnnHandle_t, + normDesc: cuda_types::cudnn8::cudnnLRNDescriptor_t, + lrnMode: cuda_types::cudnn8::cudnnLRNMode_t, + alpha: *const ::core::ffi::c_void, + xDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + x: *const ::core::ffi::c_void, + beta: *const ::core::ffi::c_void, + yDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + y: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnDivisiveNormalizationForward( + handle: cuda_types::cudnn8::cudnnHandle_t, + normDesc: cuda_types::cudnn8::cudnnLRNDescriptor_t, + mode: cuda_types::cudnn8::cudnnDivNormMode_t, + alpha: *const ::core::ffi::c_void, + xDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + x: *const ::core::ffi::c_void, + means: *const ::core::ffi::c_void, + temp: *mut ::core::ffi::c_void, + temp2: *mut ::core::ffi::c_void, + beta: *const ::core::ffi::c_void, + yDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + y: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnDeriveBNTensorDescriptor( + derivedBnDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + xDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + mode: 
cuda_types::cudnn8::cudnnBatchNormMode_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnBatchNormalizationForwardInference( + handle: cuda_types::cudnn8::cudnnHandle_t, + mode: cuda_types::cudnn8::cudnnBatchNormMode_t, + alpha: *const ::core::ffi::c_void, + beta: *const ::core::ffi::c_void, + xDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + x: *const ::core::ffi::c_void, + yDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + y: *mut ::core::ffi::c_void, + bnScaleBiasMeanVarDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + bnScale: *const ::core::ffi::c_void, + bnBias: *const ::core::ffi::c_void, + estimatedMean: *const ::core::ffi::c_void, + estimatedVariance: *const ::core::ffi::c_void, + epsilon: f64, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnDeriveNormTensorDescriptor( + derivedNormScaleBiasDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + derivedNormMeanVarDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + xDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + mode: cuda_types::cudnn8::cudnnNormMode_t, + groupCnt: ::core::ffi::c_int, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnNormalizationForwardInference( + handle: cuda_types::cudnn8::cudnnHandle_t, + mode: cuda_types::cudnn8::cudnnNormMode_t, + normOps: cuda_types::cudnn8::cudnnNormOps_t, + algo: cuda_types::cudnn8::cudnnNormAlgo_t, + alpha: *const ::core::ffi::c_void, + beta: *const ::core::ffi::c_void, + xDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + x: *const ::core::ffi::c_void, + normScaleBiasDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + normScale: *const ::core::ffi::c_void, + normBias: *const ::core::ffi::c_void, + normMeanVarDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + estimatedMean: *const ::core::ffi::c_void, + estimatedVariance: *const ::core::ffi::c_void, + zDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + z: *const ::core::ffi::c_void, + activationDesc: cuda_types::cudnn8::cudnnActivationDescriptor_t, + yDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + y: *mut ::core::ffi::c_void, + epsilon: f64, + groupCnt: ::core::ffi::c_int, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnCreateSpatialTransformerDescriptor( + stDesc: *mut cuda_types::cudnn8::cudnnSpatialTransformerDescriptor_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnSetSpatialTransformerNdDescriptor( + stDesc: cuda_types::cudnn8::cudnnSpatialTransformerDescriptor_t, + samplerType: cuda_types::cudnn8::cudnnSamplerType_t, + dataType: cuda_types::cudnn8::cudnnDataType_t, + nbDims: ::core::ffi::c_int, + dimA: *const ::core::ffi::c_int, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnDestroySpatialTransformerDescriptor( + stDesc: cuda_types::cudnn8::cudnnSpatialTransformerDescriptor_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnSpatialTfGridGeneratorForward( + handle: cuda_types::cudnn8::cudnnHandle_t, + stDesc: cuda_types::cudnn8::cudnnSpatialTransformerDescriptor_t, + theta: *const ::core::ffi::c_void, + grid: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnSpatialTfSamplerForward( + handle: cuda_types::cudnn8::cudnnHandle_t, + stDesc: cuda_types::cudnn8::cudnnSpatialTransformerDescriptor_t, + alpha: *const ::core::ffi::c_void, + xDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + x: *const ::core::ffi::c_void, + grid: *const ::core::ffi::c_void, + beta: *const ::core::ffi::c_void, + yDesc: 
cuda_types::cudnn8::cudnnTensorDescriptor_t, + y: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnCreateDropoutDescriptor( + dropoutDesc: *mut cuda_types::cudnn8::cudnnDropoutDescriptor_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnDestroyDropoutDescriptor( + dropoutDesc: cuda_types::cudnn8::cudnnDropoutDescriptor_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnDropoutGetStatesSize( + handle: cuda_types::cudnn8::cudnnHandle_t, + sizeInBytes: *mut usize, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnDropoutGetReserveSpaceSize( + xdesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + sizeInBytes: *mut usize, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnSetDropoutDescriptor( + dropoutDesc: cuda_types::cudnn8::cudnnDropoutDescriptor_t, + handle: cuda_types::cudnn8::cudnnHandle_t, + dropout: f32, + states: *mut ::core::ffi::c_void, + stateSizeInBytes: usize, + seed: ::core::ffi::c_ulonglong, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnRestoreDropoutDescriptor( + dropoutDesc: cuda_types::cudnn8::cudnnDropoutDescriptor_t, + handle: cuda_types::cudnn8::cudnnHandle_t, + dropout: f32, + states: *mut ::core::ffi::c_void, + stateSizeInBytes: usize, + seed: ::core::ffi::c_ulonglong, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetDropoutDescriptor( + dropoutDesc: cuda_types::cudnn8::cudnnDropoutDescriptor_t, + handle: cuda_types::cudnn8::cudnnHandle_t, + dropout: *mut f32, + states: *mut *mut ::core::ffi::c_void, + seed: *mut ::core::ffi::c_ulonglong, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnDropoutForward( + handle: cuda_types::cudnn8::cudnnHandle_t, + dropoutDesc: cuda_types::cudnn8::cudnnDropoutDescriptor_t, + xdesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + x: *const ::core::ffi::c_void, + ydesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + y: *mut ::core::ffi::c_void, + reserveSpace: *mut ::core::ffi::c_void, + reserveSpaceSizeInBytes: usize, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnCreateAlgorithmDescriptor( + algoDesc: *mut cuda_types::cudnn8::cudnnAlgorithmDescriptor_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnSetAlgorithmDescriptor( + algoDesc: cuda_types::cudnn8::cudnnAlgorithmDescriptor_t, + algorithm: cuda_types::cudnn8::cudnnAlgorithm_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetAlgorithmDescriptor( + algoDesc: cuda_types::cudnn8::cudnnAlgorithmDescriptor_t, + algorithm: *mut cuda_types::cudnn8::cudnnAlgorithm_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnCopyAlgorithmDescriptor( + src: cuda_types::cudnn8::cudnnAlgorithmDescriptor_t, + dest: cuda_types::cudnn8::cudnnAlgorithmDescriptor_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnDestroyAlgorithmDescriptor( + algoDesc: cuda_types::cudnn8::cudnnAlgorithmDescriptor_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnCreateAlgorithmPerformance( + algoPerf: *mut cuda_types::cudnn8::cudnnAlgorithmPerformance_t, + numberToCreate: ::core::ffi::c_int, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnSetAlgorithmPerformance( + algoPerf: cuda_types::cudnn8::cudnnAlgorithmPerformance_t, + algoDesc: cuda_types::cudnn8::cudnnAlgorithmDescriptor_t, + status: cuda_types::cudnn8::cudnnStatus_t, + time: f32, + memory: usize, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn 
cudnnGetAlgorithmPerformance( + algoPerf: cuda_types::cudnn8::cudnnAlgorithmPerformance_t, + algoDesc: *mut cuda_types::cudnn8::cudnnAlgorithmDescriptor_t, + status: *mut cuda_types::cudnn8::cudnnStatus_t, + time: *mut f32, + memory: *mut usize, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnDestroyAlgorithmPerformance( + algoPerf: *mut cuda_types::cudnn8::cudnnAlgorithmPerformance_t, + numberToDestroy: ::core::ffi::c_int, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetAlgorithmSpaceSize( + handle: cuda_types::cudnn8::cudnnHandle_t, + algoDesc: cuda_types::cudnn8::cudnnAlgorithmDescriptor_t, + algoSpaceSizeInBytes: *mut usize, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnSaveAlgorithm( + handle: cuda_types::cudnn8::cudnnHandle_t, + algoDesc: cuda_types::cudnn8::cudnnAlgorithmDescriptor_t, + algoSpace: *mut ::core::ffi::c_void, + algoSpaceSizeInBytes: usize, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnRestoreAlgorithm( + handle: cuda_types::cudnn8::cudnnHandle_t, + algoSpace: *mut ::core::ffi::c_void, + algoSpaceSizeInBytes: usize, + algoDesc: cuda_types::cudnn8::cudnnAlgorithmDescriptor_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnSetCallback( + mask: ::core::ffi::c_uint, + udata: *mut ::core::ffi::c_void, + fptr: cuda_types::cudnn8::cudnnCallback_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetCallback( + mask: *mut ::core::ffi::c_uint, + udata: *mut *mut ::core::ffi::c_void, + fptr: *mut cuda_types::cudnn8::cudnnCallback_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnOpsInferVersionCheck() -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnSoftmaxBackward( + handle: cuda_types::cudnn8::cudnnHandle_t, + algo: cuda_types::cudnn8::cudnnSoftmaxAlgorithm_t, + mode: cuda_types::cudnn8::cudnnSoftmaxMode_t, + alpha: *const ::core::ffi::c_void, + yDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + y: *const ::core::ffi::c_void, + dyDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + dy: *const ::core::ffi::c_void, + beta: *const ::core::ffi::c_void, + dxDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + dx: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnPoolingBackward( + handle: cuda_types::cudnn8::cudnnHandle_t, + poolingDesc: cuda_types::cudnn8::cudnnPoolingDescriptor_t, + alpha: *const ::core::ffi::c_void, + yDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + y: *const ::core::ffi::c_void, + dyDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + dy: *const ::core::ffi::c_void, + xDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + x: *const ::core::ffi::c_void, + beta: *const ::core::ffi::c_void, + dxDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + dx: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnActivationBackward( + handle: cuda_types::cudnn8::cudnnHandle_t, + activationDesc: cuda_types::cudnn8::cudnnActivationDescriptor_t, + alpha: *const ::core::ffi::c_void, + yDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + y: *const ::core::ffi::c_void, + dyDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + dy: *const ::core::ffi::c_void, + xDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + x: *const ::core::ffi::c_void, + beta: *const ::core::ffi::c_void, + dxDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + dx: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn 
cudnnLRNCrossChannelBackward( + handle: cuda_types::cudnn8::cudnnHandle_t, + normDesc: cuda_types::cudnn8::cudnnLRNDescriptor_t, + lrnMode: cuda_types::cudnn8::cudnnLRNMode_t, + alpha: *const ::core::ffi::c_void, + yDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + y: *const ::core::ffi::c_void, + dyDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + dy: *const ::core::ffi::c_void, + xDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + x: *const ::core::ffi::c_void, + beta: *const ::core::ffi::c_void, + dxDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + dx: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnDivisiveNormalizationBackward( + handle: cuda_types::cudnn8::cudnnHandle_t, + normDesc: cuda_types::cudnn8::cudnnLRNDescriptor_t, + mode: cuda_types::cudnn8::cudnnDivNormMode_t, + alpha: *const ::core::ffi::c_void, + xDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + x: *const ::core::ffi::c_void, + means: *const ::core::ffi::c_void, + dy: *const ::core::ffi::c_void, + temp: *mut ::core::ffi::c_void, + temp2: *mut ::core::ffi::c_void, + beta: *const ::core::ffi::c_void, + dXdMeansDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + dx: *mut ::core::ffi::c_void, + dMeans: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( + handle: cuda_types::cudnn8::cudnnHandle_t, + mode: cuda_types::cudnn8::cudnnBatchNormMode_t, + bnOps: cuda_types::cudnn8::cudnnBatchNormOps_t, + xDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + zDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + yDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + bnScaleBiasMeanVarDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + activationDesc: cuda_types::cudnn8::cudnnActivationDescriptor_t, + sizeInBytes: *mut usize, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetBatchNormalizationBackwardExWorkspaceSize( + handle: cuda_types::cudnn8::cudnnHandle_t, + mode: cuda_types::cudnn8::cudnnBatchNormMode_t, + bnOps: cuda_types::cudnn8::cudnnBatchNormOps_t, + xDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + yDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + dyDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + dzDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + dxDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + dBnScaleBiasDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + activationDesc: cuda_types::cudnn8::cudnnActivationDescriptor_t, + sizeInBytes: *mut usize, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetBatchNormalizationTrainingExReserveSpaceSize( + handle: cuda_types::cudnn8::cudnnHandle_t, + mode: cuda_types::cudnn8::cudnnBatchNormMode_t, + bnOps: cuda_types::cudnn8::cudnnBatchNormOps_t, + activationDesc: cuda_types::cudnn8::cudnnActivationDescriptor_t, + xDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + sizeInBytes: *mut usize, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnBatchNormalizationForwardTraining( + handle: cuda_types::cudnn8::cudnnHandle_t, + mode: cuda_types::cudnn8::cudnnBatchNormMode_t, + alpha: *const ::core::ffi::c_void, + beta: *const ::core::ffi::c_void, + xDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + x: *const ::core::ffi::c_void, + yDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + y: *mut ::core::ffi::c_void, + bnScaleBiasMeanVarDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + bnScale: *const ::core::ffi::c_void, + bnBias: *const ::core::ffi::c_void, + 
exponentialAverageFactor: f64, + resultRunningMean: *mut ::core::ffi::c_void, + resultRunningVariance: *mut ::core::ffi::c_void, + epsilon: f64, + resultSaveMean: *mut ::core::ffi::c_void, + resultSaveInvVariance: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnBatchNormalizationForwardTrainingEx( + handle: cuda_types::cudnn8::cudnnHandle_t, + mode: cuda_types::cudnn8::cudnnBatchNormMode_t, + bnOps: cuda_types::cudnn8::cudnnBatchNormOps_t, + alpha: *const ::core::ffi::c_void, + beta: *const ::core::ffi::c_void, + xDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + xData: *const ::core::ffi::c_void, + zDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + zData: *const ::core::ffi::c_void, + yDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + yData: *mut ::core::ffi::c_void, + bnScaleBiasMeanVarDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + bnScale: *const ::core::ffi::c_void, + bnBias: *const ::core::ffi::c_void, + exponentialAverageFactor: f64, + resultRunningMean: *mut ::core::ffi::c_void, + resultRunningVariance: *mut ::core::ffi::c_void, + epsilon: f64, + resultSaveMean: *mut ::core::ffi::c_void, + resultSaveInvVariance: *mut ::core::ffi::c_void, + activationDesc: cuda_types::cudnn8::cudnnActivationDescriptor_t, + workspace: *mut ::core::ffi::c_void, + workSpaceSizeInBytes: usize, + reserveSpace: *mut ::core::ffi::c_void, + reserveSpaceSizeInBytes: usize, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnBatchNormalizationBackward( + handle: cuda_types::cudnn8::cudnnHandle_t, + mode: cuda_types::cudnn8::cudnnBatchNormMode_t, + alphaDataDiff: *const ::core::ffi::c_void, + betaDataDiff: *const ::core::ffi::c_void, + alphaParamDiff: *const ::core::ffi::c_void, + betaParamDiff: *const ::core::ffi::c_void, + xDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + x: *const ::core::ffi::c_void, + dyDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + dy: *const ::core::ffi::c_void, + dxDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + dx: *mut ::core::ffi::c_void, + dBnScaleBiasDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + bnScale: *const ::core::ffi::c_void, + dBnScaleResult: *mut ::core::ffi::c_void, + dBnBiasResult: *mut ::core::ffi::c_void, + epsilon: f64, + savedMean: *const ::core::ffi::c_void, + savedInvVariance: *const ::core::ffi::c_void, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnBatchNormalizationBackwardEx( + handle: cuda_types::cudnn8::cudnnHandle_t, + mode: cuda_types::cudnn8::cudnnBatchNormMode_t, + bnOps: cuda_types::cudnn8::cudnnBatchNormOps_t, + alphaDataDiff: *const ::core::ffi::c_void, + betaDataDiff: *const ::core::ffi::c_void, + alphaParamDiff: *const ::core::ffi::c_void, + betaParamDiff: *const ::core::ffi::c_void, + xDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + xData: *const ::core::ffi::c_void, + yDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + yData: *const ::core::ffi::c_void, + dyDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + dyData: *const ::core::ffi::c_void, + dzDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + dzData: *mut ::core::ffi::c_void, + dxDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + dxData: *mut ::core::ffi::c_void, + dBnScaleBiasDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + bnScaleData: *const ::core::ffi::c_void, + bnBiasData: *const ::core::ffi::c_void, + dBnScaleData: *mut ::core::ffi::c_void, + dBnBiasData: *mut ::core::ffi::c_void, + epsilon: f64, + savedMean: *const ::core::ffi::c_void, + savedInvVariance: 
*const ::core::ffi::c_void, + activationDesc: cuda_types::cudnn8::cudnnActivationDescriptor_t, + workSpace: *mut ::core::ffi::c_void, + workSpaceSizeInBytes: usize, + reserveSpace: *mut ::core::ffi::c_void, + reserveSpaceSizeInBytes: usize, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetNormalizationForwardTrainingWorkspaceSize( + handle: cuda_types::cudnn8::cudnnHandle_t, + mode: cuda_types::cudnn8::cudnnNormMode_t, + normOps: cuda_types::cudnn8::cudnnNormOps_t, + algo: cuda_types::cudnn8::cudnnNormAlgo_t, + xDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + zDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + yDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + normScaleBiasDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + activationDesc: cuda_types::cudnn8::cudnnActivationDescriptor_t, + normMeanVarDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + sizeInBytes: *mut usize, + groupCnt: ::core::ffi::c_int, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetNormalizationBackwardWorkspaceSize( + handle: cuda_types::cudnn8::cudnnHandle_t, + mode: cuda_types::cudnn8::cudnnNormMode_t, + normOps: cuda_types::cudnn8::cudnnNormOps_t, + algo: cuda_types::cudnn8::cudnnNormAlgo_t, + xDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + yDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + dyDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + dzDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + dxDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + dNormScaleBiasDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + activationDesc: cuda_types::cudnn8::cudnnActivationDescriptor_t, + normMeanVarDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + sizeInBytes: *mut usize, + groupCnt: ::core::ffi::c_int, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetNormalizationTrainingReserveSpaceSize( + handle: cuda_types::cudnn8::cudnnHandle_t, + mode: cuda_types::cudnn8::cudnnNormMode_t, + normOps: cuda_types::cudnn8::cudnnNormOps_t, + algo: cuda_types::cudnn8::cudnnNormAlgo_t, + activationDesc: cuda_types::cudnn8::cudnnActivationDescriptor_t, + xDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + sizeInBytes: *mut usize, + groupCnt: ::core::ffi::c_int, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnNormalizationForwardTraining( + handle: cuda_types::cudnn8::cudnnHandle_t, + mode: cuda_types::cudnn8::cudnnNormMode_t, + normOps: cuda_types::cudnn8::cudnnNormOps_t, + algo: cuda_types::cudnn8::cudnnNormAlgo_t, + alpha: *const ::core::ffi::c_void, + beta: *const ::core::ffi::c_void, + xDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + xData: *const ::core::ffi::c_void, + normScaleBiasDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + normScale: *const ::core::ffi::c_void, + normBias: *const ::core::ffi::c_void, + exponentialAverageFactor: f64, + normMeanVarDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + resultRunningMean: *mut ::core::ffi::c_void, + resultRunningVariance: *mut ::core::ffi::c_void, + epsilon: f64, + resultSaveMean: *mut ::core::ffi::c_void, + resultSaveInvVariance: *mut ::core::ffi::c_void, + activationDesc: cuda_types::cudnn8::cudnnActivationDescriptor_t, + zDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + zData: *const ::core::ffi::c_void, + yDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + yData: *mut ::core::ffi::c_void, + workspace: *mut ::core::ffi::c_void, + workSpaceSizeInBytes: usize, + reserveSpace: *mut ::core::ffi::c_void, + reserveSpaceSizeInBytes: usize, + groupCnt: 
::core::ffi::c_int, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnNormalizationBackward( + handle: cuda_types::cudnn8::cudnnHandle_t, + mode: cuda_types::cudnn8::cudnnNormMode_t, + normOps: cuda_types::cudnn8::cudnnNormOps_t, + algo: cuda_types::cudnn8::cudnnNormAlgo_t, + alphaDataDiff: *const ::core::ffi::c_void, + betaDataDiff: *const ::core::ffi::c_void, + alphaParamDiff: *const ::core::ffi::c_void, + betaParamDiff: *const ::core::ffi::c_void, + xDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + xData: *const ::core::ffi::c_void, + yDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + yData: *const ::core::ffi::c_void, + dyDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + dyData: *const ::core::ffi::c_void, + dzDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + dzData: *mut ::core::ffi::c_void, + dxDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + dxData: *mut ::core::ffi::c_void, + dNormScaleBiasDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + normScaleData: *const ::core::ffi::c_void, + normBiasData: *const ::core::ffi::c_void, + dNormScaleData: *mut ::core::ffi::c_void, + dNormBiasData: *mut ::core::ffi::c_void, + epsilon: f64, + normMeanVarDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + savedMean: *const ::core::ffi::c_void, + savedInvVariance: *const ::core::ffi::c_void, + activationDesc: cuda_types::cudnn8::cudnnActivationDescriptor_t, + workSpace: *mut ::core::ffi::c_void, + workSpaceSizeInBytes: usize, + reserveSpace: *mut ::core::ffi::c_void, + reserveSpaceSizeInBytes: usize, + groupCnt: ::core::ffi::c_int, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnSpatialTfGridGeneratorBackward( + handle: cuda_types::cudnn8::cudnnHandle_t, + stDesc: cuda_types::cudnn8::cudnnSpatialTransformerDescriptor_t, + dgrid: *const ::core::ffi::c_void, + dtheta: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnSpatialTfSamplerBackward( + handle: cuda_types::cudnn8::cudnnHandle_t, + stDesc: cuda_types::cudnn8::cudnnSpatialTransformerDescriptor_t, + alpha: *const ::core::ffi::c_void, + xDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + x: *const ::core::ffi::c_void, + beta: *const ::core::ffi::c_void, + dxDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + dx: *mut ::core::ffi::c_void, + alphaDgrid: *const ::core::ffi::c_void, + dyDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + dy: *const ::core::ffi::c_void, + grid: *const ::core::ffi::c_void, + betaDgrid: *const ::core::ffi::c_void, + dgrid: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnDropoutBackward( + handle: cuda_types::cudnn8::cudnnHandle_t, + dropoutDesc: cuda_types::cudnn8::cudnnDropoutDescriptor_t, + dydesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + dy: *const ::core::ffi::c_void, + dxdesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + dx: *mut ::core::ffi::c_void, + reserveSpace: *mut ::core::ffi::c_void, + reserveSpaceSizeInBytes: usize, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnOpsTrainVersionCheck() -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnCreateRNNDescriptor( + rnnDesc: *mut cuda_types::cudnn8::cudnnRNNDescriptor_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnDestroyRNNDescriptor( + rnnDesc: cuda_types::cudnn8::cudnnRNNDescriptor_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnSetRNNDescriptor_v8( + rnnDesc: cuda_types::cudnn8::cudnnRNNDescriptor_t, + algo: 
cuda_types::cudnn8::cudnnRNNAlgo_t, + cellMode: cuda_types::cudnn8::cudnnRNNMode_t, + biasMode: cuda_types::cudnn8::cudnnRNNBiasMode_t, + dirMode: cuda_types::cudnn8::cudnnDirectionMode_t, + inputMode: cuda_types::cudnn8::cudnnRNNInputMode_t, + dataType: cuda_types::cudnn8::cudnnDataType_t, + mathPrec: cuda_types::cudnn8::cudnnDataType_t, + mathType: cuda_types::cudnn8::cudnnMathType_t, + inputSize: i32, + hiddenSize: i32, + projSize: i32, + numLayers: i32, + dropoutDesc: cuda_types::cudnn8::cudnnDropoutDescriptor_t, + auxFlags: u32, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetRNNDescriptor_v8( + rnnDesc: cuda_types::cudnn8::cudnnRNNDescriptor_t, + algo: *mut cuda_types::cudnn8::cudnnRNNAlgo_t, + cellMode: *mut cuda_types::cudnn8::cudnnRNNMode_t, + biasMode: *mut cuda_types::cudnn8::cudnnRNNBiasMode_t, + dirMode: *mut cuda_types::cudnn8::cudnnDirectionMode_t, + inputMode: *mut cuda_types::cudnn8::cudnnRNNInputMode_t, + dataType: *mut cuda_types::cudnn8::cudnnDataType_t, + mathPrec: *mut cuda_types::cudnn8::cudnnDataType_t, + mathType: *mut cuda_types::cudnn8::cudnnMathType_t, + inputSize: *mut i32, + hiddenSize: *mut i32, + projSize: *mut i32, + numLayers: *mut i32, + dropoutDesc: *mut cuda_types::cudnn8::cudnnDropoutDescriptor_t, + auxFlags: *mut u32, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnSetRNNDescriptor_v6( + handle: cuda_types::cudnn8::cudnnHandle_t, + rnnDesc: cuda_types::cudnn8::cudnnRNNDescriptor_t, + hiddenSize: ::core::ffi::c_int, + numLayers: ::core::ffi::c_int, + dropoutDesc: cuda_types::cudnn8::cudnnDropoutDescriptor_t, + inputMode: cuda_types::cudnn8::cudnnRNNInputMode_t, + direction: cuda_types::cudnn8::cudnnDirectionMode_t, + cellMode: cuda_types::cudnn8::cudnnRNNMode_t, + algo: cuda_types::cudnn8::cudnnRNNAlgo_t, + mathPrec: cuda_types::cudnn8::cudnnDataType_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetRNNDescriptor_v6( + handle: cuda_types::cudnn8::cudnnHandle_t, + rnnDesc: cuda_types::cudnn8::cudnnRNNDescriptor_t, + hiddenSize: *mut ::core::ffi::c_int, + numLayers: *mut ::core::ffi::c_int, + dropoutDesc: *mut cuda_types::cudnn8::cudnnDropoutDescriptor_t, + inputMode: *mut cuda_types::cudnn8::cudnnRNNInputMode_t, + direction: *mut cuda_types::cudnn8::cudnnDirectionMode_t, + cellMode: *mut cuda_types::cudnn8::cudnnRNNMode_t, + algo: *mut cuda_types::cudnn8::cudnnRNNAlgo_t, + mathPrec: *mut cuda_types::cudnn8::cudnnDataType_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnSetRNNMatrixMathType( + rnnDesc: cuda_types::cudnn8::cudnnRNNDescriptor_t, + mType: cuda_types::cudnn8::cudnnMathType_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetRNNMatrixMathType( + rnnDesc: cuda_types::cudnn8::cudnnRNNDescriptor_t, + mType: *mut cuda_types::cudnn8::cudnnMathType_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnSetRNNBiasMode( + rnnDesc: cuda_types::cudnn8::cudnnRNNDescriptor_t, + biasMode: cuda_types::cudnn8::cudnnRNNBiasMode_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetRNNBiasMode( + rnnDesc: cuda_types::cudnn8::cudnnRNNDescriptor_t, + biasMode: *mut cuda_types::cudnn8::cudnnRNNBiasMode_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnRNNSetClip_v8( + rnnDesc: cuda_types::cudnn8::cudnnRNNDescriptor_t, + clipMode: cuda_types::cudnn8::cudnnRNNClipMode_t, + clipNanOpt: cuda_types::cudnn8::cudnnNanPropagation_t, + lclip: f64, + rclip: f64, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn 
cudnnRNNGetClip_v8( + rnnDesc: cuda_types::cudnn8::cudnnRNNDescriptor_t, + clipMode: *mut cuda_types::cudnn8::cudnnRNNClipMode_t, + clipNanOpt: *mut cuda_types::cudnn8::cudnnNanPropagation_t, + lclip: *mut f64, + rclip: *mut f64, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnRNNSetClip( + handle: cuda_types::cudnn8::cudnnHandle_t, + rnnDesc: cuda_types::cudnn8::cudnnRNNDescriptor_t, + clipMode: cuda_types::cudnn8::cudnnRNNClipMode_t, + clipNanOpt: cuda_types::cudnn8::cudnnNanPropagation_t, + lclip: f64, + rclip: f64, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnRNNGetClip( + handle: cuda_types::cudnn8::cudnnHandle_t, + rnnDesc: cuda_types::cudnn8::cudnnRNNDescriptor_t, + clipMode: *mut cuda_types::cudnn8::cudnnRNNClipMode_t, + clipNanOpt: *mut cuda_types::cudnn8::cudnnNanPropagation_t, + lclip: *mut f64, + rclip: *mut f64, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnSetRNNProjectionLayers( + handle: cuda_types::cudnn8::cudnnHandle_t, + rnnDesc: cuda_types::cudnn8::cudnnRNNDescriptor_t, + recProjSize: ::core::ffi::c_int, + outProjSize: ::core::ffi::c_int, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetRNNProjectionLayers( + handle: cuda_types::cudnn8::cudnnHandle_t, + rnnDesc: cuda_types::cudnn8::cudnnRNNDescriptor_t, + recProjSize: *mut ::core::ffi::c_int, + outProjSize: *mut ::core::ffi::c_int, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnCreatePersistentRNNPlan( + rnnDesc: cuda_types::cudnn8::cudnnRNNDescriptor_t, + minibatch: ::core::ffi::c_int, + dataType: cuda_types::cudnn8::cudnnDataType_t, + plan: *mut cuda_types::cudnn8::cudnnPersistentRNNPlan_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnDestroyPersistentRNNPlan( + plan: cuda_types::cudnn8::cudnnPersistentRNNPlan_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnSetPersistentRNNPlan( + rnnDesc: cuda_types::cudnn8::cudnnRNNDescriptor_t, + plan: cuda_types::cudnn8::cudnnPersistentRNNPlan_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnBuildRNNDynamic( + handle: cuda_types::cudnn8::cudnnHandle_t, + rnnDesc: cuda_types::cudnn8::cudnnRNNDescriptor_t, + miniBatch: ::core::ffi::c_int, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetRNNWorkspaceSize( + handle: cuda_types::cudnn8::cudnnHandle_t, + rnnDesc: cuda_types::cudnn8::cudnnRNNDescriptor_t, + seqLength: ::core::ffi::c_int, + xDesc: *const cuda_types::cudnn8::cudnnTensorDescriptor_t, + sizeInBytes: *mut usize, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetRNNTrainingReserveSize( + handle: cuda_types::cudnn8::cudnnHandle_t, + rnnDesc: cuda_types::cudnn8::cudnnRNNDescriptor_t, + seqLength: ::core::ffi::c_int, + xDesc: *const cuda_types::cudnn8::cudnnTensorDescriptor_t, + sizeInBytes: *mut usize, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetRNNTempSpaceSizes( + handle: cuda_types::cudnn8::cudnnHandle_t, + rnnDesc: cuda_types::cudnn8::cudnnRNNDescriptor_t, + fwdMode: cuda_types::cudnn8::cudnnForwardMode_t, + xDesc: cuda_types::cudnn8::cudnnRNNDataDescriptor_t, + workSpaceSize: *mut usize, + reserveSpaceSize: *mut usize, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetRNNParamsSize( + handle: cuda_types::cudnn8::cudnnHandle_t, + rnnDesc: cuda_types::cudnn8::cudnnRNNDescriptor_t, + xDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + sizeInBytes: *mut usize, + dataType: cuda_types::cudnn8::cudnnDataType_t, + ) -> 
cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetRNNWeightSpaceSize( + handle: cuda_types::cudnn8::cudnnHandle_t, + rnnDesc: cuda_types::cudnn8::cudnnRNNDescriptor_t, + weightSpaceSize: *mut usize, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetRNNLinLayerMatrixParams( + handle: cuda_types::cudnn8::cudnnHandle_t, + rnnDesc: cuda_types::cudnn8::cudnnRNNDescriptor_t, + pseudoLayer: ::core::ffi::c_int, + xDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + wDesc: cuda_types::cudnn8::cudnnFilterDescriptor_t, + w: *const ::core::ffi::c_void, + linLayerID: ::core::ffi::c_int, + linLayerMatDesc: cuda_types::cudnn8::cudnnFilterDescriptor_t, + linLayerMat: *mut *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetRNNLinLayerBiasParams( + handle: cuda_types::cudnn8::cudnnHandle_t, + rnnDesc: cuda_types::cudnn8::cudnnRNNDescriptor_t, + pseudoLayer: ::core::ffi::c_int, + xDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + wDesc: cuda_types::cudnn8::cudnnFilterDescriptor_t, + w: *const ::core::ffi::c_void, + linLayerID: ::core::ffi::c_int, + linLayerBiasDesc: cuda_types::cudnn8::cudnnFilterDescriptor_t, + linLayerBias: *mut *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetRNNWeightParams( + handle: cuda_types::cudnn8::cudnnHandle_t, + rnnDesc: cuda_types::cudnn8::cudnnRNNDescriptor_t, + pseudoLayer: i32, + weightSpaceSize: usize, + weightSpace: *const ::core::ffi::c_void, + linLayerID: i32, + mDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + mAddr: *mut *mut ::core::ffi::c_void, + bDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + bAddr: *mut *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnRNNForwardInference( + handle: cuda_types::cudnn8::cudnnHandle_t, + rnnDesc: cuda_types::cudnn8::cudnnRNNDescriptor_t, + seqLength: ::core::ffi::c_int, + xDesc: *const cuda_types::cudnn8::cudnnTensorDescriptor_t, + x: *const ::core::ffi::c_void, + hxDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + hx: *const ::core::ffi::c_void, + cxDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + cx: *const ::core::ffi::c_void, + wDesc: cuda_types::cudnn8::cudnnFilterDescriptor_t, + w: *const ::core::ffi::c_void, + yDesc: *const cuda_types::cudnn8::cudnnTensorDescriptor_t, + y: *mut ::core::ffi::c_void, + hyDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + hy: *mut ::core::ffi::c_void, + cyDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + cy: *mut ::core::ffi::c_void, + workSpace: *mut ::core::ffi::c_void, + workSpaceSizeInBytes: usize, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnSetRNNPaddingMode( + rnnDesc: cuda_types::cudnn8::cudnnRNNDescriptor_t, + paddingMode: ::core::ffi::c_uint, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetRNNPaddingMode( + rnnDesc: cuda_types::cudnn8::cudnnRNNDescriptor_t, + paddingMode: *mut ::core::ffi::c_uint, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnCreateRNNDataDescriptor( + rnnDataDesc: *mut cuda_types::cudnn8::cudnnRNNDataDescriptor_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnDestroyRNNDataDescriptor( + rnnDataDesc: cuda_types::cudnn8::cudnnRNNDataDescriptor_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnSetRNNDataDescriptor( + rnnDataDesc: cuda_types::cudnn8::cudnnRNNDataDescriptor_t, + dataType: cuda_types::cudnn8::cudnnDataType_t, + layout: cuda_types::cudnn8::cudnnRNNDataLayout_t, + 
maxSeqLength: ::core::ffi::c_int, + batchSize: ::core::ffi::c_int, + vectorSize: ::core::ffi::c_int, + seqLengthArray: *const ::core::ffi::c_int, + paddingFill: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetRNNDataDescriptor( + rnnDataDesc: cuda_types::cudnn8::cudnnRNNDataDescriptor_t, + dataType: *mut cuda_types::cudnn8::cudnnDataType_t, + layout: *mut cuda_types::cudnn8::cudnnRNNDataLayout_t, + maxSeqLength: *mut ::core::ffi::c_int, + batchSize: *mut ::core::ffi::c_int, + vectorSize: *mut ::core::ffi::c_int, + arrayLengthRequested: ::core::ffi::c_int, + seqLengthArray: *mut ::core::ffi::c_int, + paddingFill: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnRNNForwardInferenceEx( + handle: cuda_types::cudnn8::cudnnHandle_t, + rnnDesc: cuda_types::cudnn8::cudnnRNNDescriptor_t, + xDesc: cuda_types::cudnn8::cudnnRNNDataDescriptor_t, + x: *const ::core::ffi::c_void, + hxDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + hx: *const ::core::ffi::c_void, + cxDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + cx: *const ::core::ffi::c_void, + wDesc: cuda_types::cudnn8::cudnnFilterDescriptor_t, + w: *const ::core::ffi::c_void, + yDesc: cuda_types::cudnn8::cudnnRNNDataDescriptor_t, + y: *mut ::core::ffi::c_void, + hyDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + hy: *mut ::core::ffi::c_void, + cyDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + cy: *mut ::core::ffi::c_void, + kDesc: cuda_types::cudnn8::cudnnRNNDataDescriptor_t, + keys: *const ::core::ffi::c_void, + cDesc: cuda_types::cudnn8::cudnnRNNDataDescriptor_t, + cAttn: *mut ::core::ffi::c_void, + iDesc: cuda_types::cudnn8::cudnnRNNDataDescriptor_t, + iAttn: *mut ::core::ffi::c_void, + qDesc: cuda_types::cudnn8::cudnnRNNDataDescriptor_t, + queries: *mut ::core::ffi::c_void, + workSpace: *mut ::core::ffi::c_void, + workSpaceSizeInBytes: usize, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnRNNForward( + handle: cuda_types::cudnn8::cudnnHandle_t, + rnnDesc: cuda_types::cudnn8::cudnnRNNDescriptor_t, + fwdMode: cuda_types::cudnn8::cudnnForwardMode_t, + devSeqLengths: *const i32, + xDesc: cuda_types::cudnn8::cudnnRNNDataDescriptor_t, + x: *const ::core::ffi::c_void, + yDesc: cuda_types::cudnn8::cudnnRNNDataDescriptor_t, + y: *mut ::core::ffi::c_void, + hDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + hx: *const ::core::ffi::c_void, + hy: *mut ::core::ffi::c_void, + cDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + cx: *const ::core::ffi::c_void, + cy: *mut ::core::ffi::c_void, + weightSpaceSize: usize, + weightSpace: *const ::core::ffi::c_void, + workSpaceSize: usize, + workSpace: *mut ::core::ffi::c_void, + reserveSpaceSize: usize, + reserveSpace: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnSetRNNAlgorithmDescriptor( + handle: cuda_types::cudnn8::cudnnHandle_t, + rnnDesc: cuda_types::cudnn8::cudnnRNNDescriptor_t, + algoDesc: cuda_types::cudnn8::cudnnAlgorithmDescriptor_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetRNNForwardInferenceAlgorithmMaxCount( + handle: cuda_types::cudnn8::cudnnHandle_t, + rnnDesc: cuda_types::cudnn8::cudnnRNNDescriptor_t, + count: *mut ::core::ffi::c_int, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnFindRNNForwardInferenceAlgorithmEx( + handle: cuda_types::cudnn8::cudnnHandle_t, + rnnDesc: cuda_types::cudnn8::cudnnRNNDescriptor_t, + seqLength: ::core::ffi::c_int, + xDesc: *const 
cuda_types::cudnn8::cudnnTensorDescriptor_t, + x: *const ::core::ffi::c_void, + hxDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + hx: *const ::core::ffi::c_void, + cxDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + cx: *const ::core::ffi::c_void, + wDesc: cuda_types::cudnn8::cudnnFilterDescriptor_t, + w: *const ::core::ffi::c_void, + yDesc: *const cuda_types::cudnn8::cudnnTensorDescriptor_t, + y: *mut ::core::ffi::c_void, + hyDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + hy: *mut ::core::ffi::c_void, + cyDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + cy: *mut ::core::ffi::c_void, + findIntensity: f32, + requestedAlgoCount: ::core::ffi::c_int, + returnedAlgoCount: *mut ::core::ffi::c_int, + perfResults: *mut cuda_types::cudnn8::cudnnAlgorithmPerformance_t, + workspace: *mut ::core::ffi::c_void, + workSpaceSizeInBytes: usize, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnCreateSeqDataDescriptor( + seqDataDesc: *mut cuda_types::cudnn8::cudnnSeqDataDescriptor_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnDestroySeqDataDescriptor( + seqDataDesc: cuda_types::cudnn8::cudnnSeqDataDescriptor_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnSetSeqDataDescriptor( + seqDataDesc: cuda_types::cudnn8::cudnnSeqDataDescriptor_t, + dataType: cuda_types::cudnn8::cudnnDataType_t, + nbDims: ::core::ffi::c_int, + dimA: *const ::core::ffi::c_int, + axes: *const cuda_types::cudnn8::cudnnSeqDataAxis_t, + seqLengthArraySize: usize, + seqLengthArray: *const ::core::ffi::c_int, + paddingFill: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetSeqDataDescriptor( + seqDataDesc: cuda_types::cudnn8::cudnnSeqDataDescriptor_t, + dataType: *mut cuda_types::cudnn8::cudnnDataType_t, + nbDims: *mut ::core::ffi::c_int, + nbDimsRequested: ::core::ffi::c_int, + dimA: *mut ::core::ffi::c_int, + axes: *mut cuda_types::cudnn8::cudnnSeqDataAxis_t, + seqLengthArraySize: *mut usize, + seqLengthSizeRequested: usize, + seqLengthArray: *mut ::core::ffi::c_int, + paddingFill: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnCreateAttnDescriptor( + attnDesc: *mut cuda_types::cudnn8::cudnnAttnDescriptor_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnDestroyAttnDescriptor( + attnDesc: cuda_types::cudnn8::cudnnAttnDescriptor_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnSetAttnDescriptor( + attnDesc: cuda_types::cudnn8::cudnnAttnDescriptor_t, + attnMode: ::core::ffi::c_uint, + nHeads: ::core::ffi::c_int, + smScaler: f64, + dataType: cuda_types::cudnn8::cudnnDataType_t, + computePrec: cuda_types::cudnn8::cudnnDataType_t, + mathType: cuda_types::cudnn8::cudnnMathType_t, + attnDropoutDesc: cuda_types::cudnn8::cudnnDropoutDescriptor_t, + postDropoutDesc: cuda_types::cudnn8::cudnnDropoutDescriptor_t, + qSize: ::core::ffi::c_int, + kSize: ::core::ffi::c_int, + vSize: ::core::ffi::c_int, + qProjSize: ::core::ffi::c_int, + kProjSize: ::core::ffi::c_int, + vProjSize: ::core::ffi::c_int, + oProjSize: ::core::ffi::c_int, + qoMaxSeqLength: ::core::ffi::c_int, + kvMaxSeqLength: ::core::ffi::c_int, + maxBatchSize: ::core::ffi::c_int, + maxBeamSize: ::core::ffi::c_int, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetAttnDescriptor( + attnDesc: cuda_types::cudnn8::cudnnAttnDescriptor_t, + attnMode: *mut ::core::ffi::c_uint, + nHeads: *mut ::core::ffi::c_int, + smScaler: *mut f64, + dataType: *mut 
cuda_types::cudnn8::cudnnDataType_t, + computePrec: *mut cuda_types::cudnn8::cudnnDataType_t, + mathType: *mut cuda_types::cudnn8::cudnnMathType_t, + attnDropoutDesc: *mut cuda_types::cudnn8::cudnnDropoutDescriptor_t, + postDropoutDesc: *mut cuda_types::cudnn8::cudnnDropoutDescriptor_t, + qSize: *mut ::core::ffi::c_int, + kSize: *mut ::core::ffi::c_int, + vSize: *mut ::core::ffi::c_int, + qProjSize: *mut ::core::ffi::c_int, + kProjSize: *mut ::core::ffi::c_int, + vProjSize: *mut ::core::ffi::c_int, + oProjSize: *mut ::core::ffi::c_int, + qoMaxSeqLength: *mut ::core::ffi::c_int, + kvMaxSeqLength: *mut ::core::ffi::c_int, + maxBatchSize: *mut ::core::ffi::c_int, + maxBeamSize: *mut ::core::ffi::c_int, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetMultiHeadAttnBuffers( + handle: cuda_types::cudnn8::cudnnHandle_t, + attnDesc: cuda_types::cudnn8::cudnnAttnDescriptor_t, + weightSizeInBytes: *mut usize, + workSpaceSizeInBytes: *mut usize, + reserveSpaceSizeInBytes: *mut usize, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetMultiHeadAttnWeights( + handle: cuda_types::cudnn8::cudnnHandle_t, + attnDesc: cuda_types::cudnn8::cudnnAttnDescriptor_t, + wKind: cuda_types::cudnn8::cudnnMultiHeadAttnWeightKind_t, + weightSizeInBytes: usize, + weights: *const ::core::ffi::c_void, + wDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + wAddr: *mut *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnMultiHeadAttnForward( + handle: cuda_types::cudnn8::cudnnHandle_t, + attnDesc: cuda_types::cudnn8::cudnnAttnDescriptor_t, + currIdx: ::core::ffi::c_int, + loWinIdx: *const ::core::ffi::c_int, + hiWinIdx: *const ::core::ffi::c_int, + devSeqLengthsQO: *const ::core::ffi::c_int, + devSeqLengthsKV: *const ::core::ffi::c_int, + qDesc: cuda_types::cudnn8::cudnnSeqDataDescriptor_t, + queries: *const ::core::ffi::c_void, + residuals: *const ::core::ffi::c_void, + kDesc: cuda_types::cudnn8::cudnnSeqDataDescriptor_t, + keys: *const ::core::ffi::c_void, + vDesc: cuda_types::cudnn8::cudnnSeqDataDescriptor_t, + values: *const ::core::ffi::c_void, + oDesc: cuda_types::cudnn8::cudnnSeqDataDescriptor_t, + out: *mut ::core::ffi::c_void, + weightSizeInBytes: usize, + weights: *const ::core::ffi::c_void, + workSpaceSizeInBytes: usize, + workSpace: *mut ::core::ffi::c_void, + reserveSpaceSizeInBytes: usize, + reserveSpace: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnAdvInferVersionCheck() -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnRNNForwardTraining( + handle: cuda_types::cudnn8::cudnnHandle_t, + rnnDesc: cuda_types::cudnn8::cudnnRNNDescriptor_t, + seqLength: ::core::ffi::c_int, + xDesc: *const cuda_types::cudnn8::cudnnTensorDescriptor_t, + x: *const ::core::ffi::c_void, + hxDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + hx: *const ::core::ffi::c_void, + cxDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + cx: *const ::core::ffi::c_void, + wDesc: cuda_types::cudnn8::cudnnFilterDescriptor_t, + w: *const ::core::ffi::c_void, + yDesc: *const cuda_types::cudnn8::cudnnTensorDescriptor_t, + y: *mut ::core::ffi::c_void, + hyDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + hy: *mut ::core::ffi::c_void, + cyDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + cy: *mut ::core::ffi::c_void, + workSpace: *mut ::core::ffi::c_void, + workSpaceSizeInBytes: usize, + reserveSpace: *mut ::core::ffi::c_void, + reserveSpaceSizeInBytes: usize, + ) -> cuda_types::cudnn8::cudnnStatus_t; + 
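// The cudnn8 entry points declared in this file all follow a common
// create / set / use / destroy descriptor lifecycle. A minimal sketch of that
// pattern, using only the dropout declarations above: illustrative only,
// assuming these names resolve to `unsafe extern "C"` functions and that the
// opaque descriptor handles are raw pointers, as in typical bindgen output.
//
// unsafe fn dropout_lifecycle(handle: cuda_types::cudnn8::cudnnHandle_t) {
//     let mut desc: cuda_types::cudnn8::cudnnDropoutDescriptor_t =
//         std::ptr::null_mut();
//     // Create the descriptor, then query the RNG state size the library
//     // needs before the descriptor can be configured.
//     let _ = cudnnCreateDropoutDescriptor(&mut desc);
//     let mut state_size = 0usize;
//     let _ = cudnnDropoutGetStatesSize(handle, &mut state_size);
//     // ... allocate a device buffer `states` of `state_size` bytes (elided),
//     // then configure with dropout probability 0.5 and an example seed of 1:
//     // let _ = cudnnSetDropoutDescriptor(
//     //     desc, handle, 0.5f32, states, state_size, 1);
//     // ... cudnnDropoutForward / cudnnDropoutBackward calls go here ...
//     let _ = cudnnDestroyDropoutDescriptor(desc);
// }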
#[must_use] + fn cudnnRNNBackwardData( + handle: cuda_types::cudnn8::cudnnHandle_t, + rnnDesc: cuda_types::cudnn8::cudnnRNNDescriptor_t, + seqLength: ::core::ffi::c_int, + yDesc: *const cuda_types::cudnn8::cudnnTensorDescriptor_t, + y: *const ::core::ffi::c_void, + dyDesc: *const cuda_types::cudnn8::cudnnTensorDescriptor_t, + dy: *const ::core::ffi::c_void, + dhyDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + dhy: *const ::core::ffi::c_void, + dcyDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + dcy: *const ::core::ffi::c_void, + wDesc: cuda_types::cudnn8::cudnnFilterDescriptor_t, + w: *const ::core::ffi::c_void, + hxDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + hx: *const ::core::ffi::c_void, + cxDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + cx: *const ::core::ffi::c_void, + dxDesc: *const cuda_types::cudnn8::cudnnTensorDescriptor_t, + dx: *mut ::core::ffi::c_void, + dhxDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + dhx: *mut ::core::ffi::c_void, + dcxDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + dcx: *mut ::core::ffi::c_void, + workSpace: *mut ::core::ffi::c_void, + workSpaceSizeInBytes: usize, + reserveSpace: *mut ::core::ffi::c_void, + reserveSpaceSizeInBytes: usize, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnRNNBackwardData_v8( + handle: cuda_types::cudnn8::cudnnHandle_t, + rnnDesc: cuda_types::cudnn8::cudnnRNNDescriptor_t, + devSeqLengths: *const i32, + yDesc: cuda_types::cudnn8::cudnnRNNDataDescriptor_t, + y: *const ::core::ffi::c_void, + dy: *const ::core::ffi::c_void, + xDesc: cuda_types::cudnn8::cudnnRNNDataDescriptor_t, + dx: *mut ::core::ffi::c_void, + hDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + hx: *const ::core::ffi::c_void, + dhy: *const ::core::ffi::c_void, + dhx: *mut ::core::ffi::c_void, + cDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + cx: *const ::core::ffi::c_void, + dcy: *const ::core::ffi::c_void, + dcx: *mut ::core::ffi::c_void, + weightSpaceSize: usize, + weightSpace: *const ::core::ffi::c_void, + workSpaceSize: usize, + workSpace: *mut ::core::ffi::c_void, + reserveSpaceSize: usize, + reserveSpace: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnRNNBackwardWeights( + handle: cuda_types::cudnn8::cudnnHandle_t, + rnnDesc: cuda_types::cudnn8::cudnnRNNDescriptor_t, + seqLength: ::core::ffi::c_int, + xDesc: *const cuda_types::cudnn8::cudnnTensorDescriptor_t, + x: *const ::core::ffi::c_void, + hxDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + hx: *const ::core::ffi::c_void, + yDesc: *const cuda_types::cudnn8::cudnnTensorDescriptor_t, + y: *const ::core::ffi::c_void, + workSpace: *const ::core::ffi::c_void, + workSpaceSizeInBytes: usize, + dwDesc: cuda_types::cudnn8::cudnnFilterDescriptor_t, + dw: *mut ::core::ffi::c_void, + reserveSpace: *const ::core::ffi::c_void, + reserveSpaceSizeInBytes: usize, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnRNNBackwardWeights_v8( + handle: cuda_types::cudnn8::cudnnHandle_t, + rnnDesc: cuda_types::cudnn8::cudnnRNNDescriptor_t, + addGrad: cuda_types::cudnn8::cudnnWgradMode_t, + devSeqLengths: *const i32, + xDesc: cuda_types::cudnn8::cudnnRNNDataDescriptor_t, + x: *const ::core::ffi::c_void, + hDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + hx: *const ::core::ffi::c_void, + yDesc: cuda_types::cudnn8::cudnnRNNDataDescriptor_t, + y: *const ::core::ffi::c_void, + weightSpaceSize: usize, + dweightSpace: *mut ::core::ffi::c_void, + workSpaceSize: usize, + workSpace: *mut ::core::ffi::c_void, + 
reserveSpaceSize: usize, + reserveSpace: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnRNNForwardTrainingEx( + handle: cuda_types::cudnn8::cudnnHandle_t, + rnnDesc: cuda_types::cudnn8::cudnnRNNDescriptor_t, + xDesc: cuda_types::cudnn8::cudnnRNNDataDescriptor_t, + x: *const ::core::ffi::c_void, + hxDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + hx: *const ::core::ffi::c_void, + cxDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + cx: *const ::core::ffi::c_void, + wDesc: cuda_types::cudnn8::cudnnFilterDescriptor_t, + w: *const ::core::ffi::c_void, + yDesc: cuda_types::cudnn8::cudnnRNNDataDescriptor_t, + y: *mut ::core::ffi::c_void, + hyDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + hy: *mut ::core::ffi::c_void, + cyDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + cy: *mut ::core::ffi::c_void, + kDesc: cuda_types::cudnn8::cudnnRNNDataDescriptor_t, + keys: *const ::core::ffi::c_void, + cDesc: cuda_types::cudnn8::cudnnRNNDataDescriptor_t, + cAttn: *mut ::core::ffi::c_void, + iDesc: cuda_types::cudnn8::cudnnRNNDataDescriptor_t, + iAttn: *mut ::core::ffi::c_void, + qDesc: cuda_types::cudnn8::cudnnRNNDataDescriptor_t, + queries: *mut ::core::ffi::c_void, + workSpace: *mut ::core::ffi::c_void, + workSpaceSizeInBytes: usize, + reserveSpace: *mut ::core::ffi::c_void, + reserveSpaceSizeInBytes: usize, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnRNNBackwardDataEx( + handle: cuda_types::cudnn8::cudnnHandle_t, + rnnDesc: cuda_types::cudnn8::cudnnRNNDescriptor_t, + yDesc: cuda_types::cudnn8::cudnnRNNDataDescriptor_t, + y: *const ::core::ffi::c_void, + dyDesc: cuda_types::cudnn8::cudnnRNNDataDescriptor_t, + dy: *const ::core::ffi::c_void, + dcDesc: cuda_types::cudnn8::cudnnRNNDataDescriptor_t, + dcAttn: *const ::core::ffi::c_void, + dhyDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + dhy: *const ::core::ffi::c_void, + dcyDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + dcy: *const ::core::ffi::c_void, + wDesc: cuda_types::cudnn8::cudnnFilterDescriptor_t, + w: *const ::core::ffi::c_void, + hxDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + hx: *const ::core::ffi::c_void, + cxDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + cx: *const ::core::ffi::c_void, + dxDesc: cuda_types::cudnn8::cudnnRNNDataDescriptor_t, + dx: *mut ::core::ffi::c_void, + dhxDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + dhx: *mut ::core::ffi::c_void, + dcxDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + dcx: *mut ::core::ffi::c_void, + dkDesc: cuda_types::cudnn8::cudnnRNNDataDescriptor_t, + dkeys: *mut ::core::ffi::c_void, + workSpace: *mut ::core::ffi::c_void, + workSpaceSizeInBytes: usize, + reserveSpace: *mut ::core::ffi::c_void, + reserveSpaceSizeInBytes: usize, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnRNNBackwardWeightsEx( + handle: cuda_types::cudnn8::cudnnHandle_t, + rnnDesc: cuda_types::cudnn8::cudnnRNNDescriptor_t, + xDesc: cuda_types::cudnn8::cudnnRNNDataDescriptor_t, + x: *const ::core::ffi::c_void, + hxDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + hx: *const ::core::ffi::c_void, + yDesc: cuda_types::cudnn8::cudnnRNNDataDescriptor_t, + y: *const ::core::ffi::c_void, + workSpace: *mut ::core::ffi::c_void, + workSpaceSizeInBytes: usize, + dwDesc: cuda_types::cudnn8::cudnnFilterDescriptor_t, + dw: *mut ::core::ffi::c_void, + reserveSpace: *mut ::core::ffi::c_void, + reserveSpaceSizeInBytes: usize, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn 
cudnnGetRNNForwardTrainingAlgorithmMaxCount( + handle: cuda_types::cudnn8::cudnnHandle_t, + rnnDesc: cuda_types::cudnn8::cudnnRNNDescriptor_t, + count: *mut ::core::ffi::c_int, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnFindRNNForwardTrainingAlgorithmEx( + handle: cuda_types::cudnn8::cudnnHandle_t, + rnnDesc: cuda_types::cudnn8::cudnnRNNDescriptor_t, + seqLength: ::core::ffi::c_int, + xDesc: *const cuda_types::cudnn8::cudnnTensorDescriptor_t, + x: *const ::core::ffi::c_void, + hxDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + hx: *const ::core::ffi::c_void, + cxDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + cx: *const ::core::ffi::c_void, + wDesc: cuda_types::cudnn8::cudnnFilterDescriptor_t, + w: *const ::core::ffi::c_void, + yDesc: *const cuda_types::cudnn8::cudnnTensorDescriptor_t, + y: *mut ::core::ffi::c_void, + hyDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + hy: *mut ::core::ffi::c_void, + cyDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + cy: *mut ::core::ffi::c_void, + findIntensity: f32, + requestedAlgoCount: ::core::ffi::c_int, + returnedAlgoCount: *mut ::core::ffi::c_int, + perfResults: *mut cuda_types::cudnn8::cudnnAlgorithmPerformance_t, + workspace: *mut ::core::ffi::c_void, + workSpaceSizeInBytes: usize, + reserveSpace: *mut ::core::ffi::c_void, + reserveSpaceSizeInBytes: usize, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetRNNBackwardDataAlgorithmMaxCount( + handle: cuda_types::cudnn8::cudnnHandle_t, + rnnDesc: cuda_types::cudnn8::cudnnRNNDescriptor_t, + count: *mut ::core::ffi::c_int, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnFindRNNBackwardDataAlgorithmEx( + handle: cuda_types::cudnn8::cudnnHandle_t, + rnnDesc: cuda_types::cudnn8::cudnnRNNDescriptor_t, + seqLength: ::core::ffi::c_int, + yDesc: *const cuda_types::cudnn8::cudnnTensorDescriptor_t, + y: *const ::core::ffi::c_void, + dyDesc: *const cuda_types::cudnn8::cudnnTensorDescriptor_t, + dy: *const ::core::ffi::c_void, + dhyDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + dhy: *const ::core::ffi::c_void, + dcyDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + dcy: *const ::core::ffi::c_void, + wDesc: cuda_types::cudnn8::cudnnFilterDescriptor_t, + w: *const ::core::ffi::c_void, + hxDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + hx: *const ::core::ffi::c_void, + cxDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + cx: *const ::core::ffi::c_void, + dxDesc: *const cuda_types::cudnn8::cudnnTensorDescriptor_t, + dx: *mut ::core::ffi::c_void, + dhxDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + dhx: *mut ::core::ffi::c_void, + dcxDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + dcx: *mut ::core::ffi::c_void, + findIntensity: f32, + requestedAlgoCount: ::core::ffi::c_int, + returnedAlgoCount: *mut ::core::ffi::c_int, + perfResults: *mut cuda_types::cudnn8::cudnnAlgorithmPerformance_t, + workspace: *mut ::core::ffi::c_void, + workSpaceSizeInBytes: usize, + reserveSpace: *mut ::core::ffi::c_void, + reserveSpaceSizeInBytes: usize, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetRNNBackwardWeightsAlgorithmMaxCount( + handle: cuda_types::cudnn8::cudnnHandle_t, + rnnDesc: cuda_types::cudnn8::cudnnRNNDescriptor_t, + count: *mut ::core::ffi::c_int, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnFindRNNBackwardWeightsAlgorithmEx( + handle: cuda_types::cudnn8::cudnnHandle_t, + rnnDesc: cuda_types::cudnn8::cudnnRNNDescriptor_t, + seqLength: ::core::ffi::c_int, + xDesc: *const 
cuda_types::cudnn8::cudnnTensorDescriptor_t, + x: *const ::core::ffi::c_void, + hxDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + hx: *const ::core::ffi::c_void, + yDesc: *const cuda_types::cudnn8::cudnnTensorDescriptor_t, + y: *const ::core::ffi::c_void, + findIntensity: f32, + requestedAlgoCount: ::core::ffi::c_int, + returnedAlgoCount: *mut ::core::ffi::c_int, + perfResults: *mut cuda_types::cudnn8::cudnnAlgorithmPerformance_t, + workspace: *const ::core::ffi::c_void, + workSpaceSizeInBytes: usize, + dwDesc: cuda_types::cudnn8::cudnnFilterDescriptor_t, + dw: *mut ::core::ffi::c_void, + reserveSpace: *const ::core::ffi::c_void, + reserveSpaceSizeInBytes: usize, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnMultiHeadAttnBackwardData( + handle: cuda_types::cudnn8::cudnnHandle_t, + attnDesc: cuda_types::cudnn8::cudnnAttnDescriptor_t, + loWinIdx: *const ::core::ffi::c_int, + hiWinIdx: *const ::core::ffi::c_int, + devSeqLengthsDQDO: *const ::core::ffi::c_int, + devSeqLengthsDKDV: *const ::core::ffi::c_int, + doDesc: cuda_types::cudnn8::cudnnSeqDataDescriptor_t, + dout: *const ::core::ffi::c_void, + dqDesc: cuda_types::cudnn8::cudnnSeqDataDescriptor_t, + dqueries: *mut ::core::ffi::c_void, + queries: *const ::core::ffi::c_void, + dkDesc: cuda_types::cudnn8::cudnnSeqDataDescriptor_t, + dkeys: *mut ::core::ffi::c_void, + keys: *const ::core::ffi::c_void, + dvDesc: cuda_types::cudnn8::cudnnSeqDataDescriptor_t, + dvalues: *mut ::core::ffi::c_void, + values: *const ::core::ffi::c_void, + weightSizeInBytes: usize, + weights: *const ::core::ffi::c_void, + workSpaceSizeInBytes: usize, + workSpace: *mut ::core::ffi::c_void, + reserveSpaceSizeInBytes: usize, + reserveSpace: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnMultiHeadAttnBackwardWeights( + handle: cuda_types::cudnn8::cudnnHandle_t, + attnDesc: cuda_types::cudnn8::cudnnAttnDescriptor_t, + addGrad: cuda_types::cudnn8::cudnnWgradMode_t, + qDesc: cuda_types::cudnn8::cudnnSeqDataDescriptor_t, + queries: *const ::core::ffi::c_void, + kDesc: cuda_types::cudnn8::cudnnSeqDataDescriptor_t, + keys: *const ::core::ffi::c_void, + vDesc: cuda_types::cudnn8::cudnnSeqDataDescriptor_t, + values: *const ::core::ffi::c_void, + doDesc: cuda_types::cudnn8::cudnnSeqDataDescriptor_t, + dout: *const ::core::ffi::c_void, + weightSizeInBytes: usize, + weights: *const ::core::ffi::c_void, + dweights: *mut ::core::ffi::c_void, + workSpaceSizeInBytes: usize, + workSpace: *mut ::core::ffi::c_void, + reserveSpaceSizeInBytes: usize, + reserveSpace: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnCreateCTCLossDescriptor( + ctcLossDesc: *mut cuda_types::cudnn8::cudnnCTCLossDescriptor_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnSetCTCLossDescriptor( + ctcLossDesc: cuda_types::cudnn8::cudnnCTCLossDescriptor_t, + compType: cuda_types::cudnn8::cudnnDataType_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnSetCTCLossDescriptorEx( + ctcLossDesc: cuda_types::cudnn8::cudnnCTCLossDescriptor_t, + compType: cuda_types::cudnn8::cudnnDataType_t, + normMode: cuda_types::cudnn8::cudnnLossNormalizationMode_t, + gradMode: cuda_types::cudnn8::cudnnNanPropagation_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnSetCTCLossDescriptor_v8( + ctcLossDesc: cuda_types::cudnn8::cudnnCTCLossDescriptor_t, + compType: cuda_types::cudnn8::cudnnDataType_t, + normMode: cuda_types::cudnn8::cudnnLossNormalizationMode_t, + gradMode: 
cuda_types::cudnn8::cudnnNanPropagation_t, + maxLabelLength: ::core::ffi::c_int, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetCTCLossDescriptor( + ctcLossDesc: cuda_types::cudnn8::cudnnCTCLossDescriptor_t, + compType: *mut cuda_types::cudnn8::cudnnDataType_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetCTCLossDescriptorEx( + ctcLossDesc: cuda_types::cudnn8::cudnnCTCLossDescriptor_t, + compType: *mut cuda_types::cudnn8::cudnnDataType_t, + normMode: *mut cuda_types::cudnn8::cudnnLossNormalizationMode_t, + gradMode: *mut cuda_types::cudnn8::cudnnNanPropagation_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetCTCLossDescriptor_v8( + ctcLossDesc: cuda_types::cudnn8::cudnnCTCLossDescriptor_t, + compType: *mut cuda_types::cudnn8::cudnnDataType_t, + normMode: *mut cuda_types::cudnn8::cudnnLossNormalizationMode_t, + gradMode: *mut cuda_types::cudnn8::cudnnNanPropagation_t, + maxLabelLength: *mut ::core::ffi::c_int, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnDestroyCTCLossDescriptor( + ctcLossDesc: cuda_types::cudnn8::cudnnCTCLossDescriptor_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnCTCLoss( + handle: cuda_types::cudnn8::cudnnHandle_t, + probsDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + probs: *const ::core::ffi::c_void, + hostLabels: *const ::core::ffi::c_int, + hostLabelLengths: *const ::core::ffi::c_int, + hostInputLengths: *const ::core::ffi::c_int, + costs: *mut ::core::ffi::c_void, + gradientsDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + gradients: *mut ::core::ffi::c_void, + algo: cuda_types::cudnn8::cudnnCTCLossAlgo_t, + ctcLossDesc: cuda_types::cudnn8::cudnnCTCLossDescriptor_t, + workspace: *mut ::core::ffi::c_void, + workSpaceSizeInBytes: usize, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnCTCLoss_v8( + handle: cuda_types::cudnn8::cudnnHandle_t, + algo: cuda_types::cudnn8::cudnnCTCLossAlgo_t, + ctcLossDesc: cuda_types::cudnn8::cudnnCTCLossDescriptor_t, + probsDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + probs: *const ::core::ffi::c_void, + labels: *const ::core::ffi::c_int, + labelLengths: *const ::core::ffi::c_int, + inputLengths: *const ::core::ffi::c_int, + costs: *mut ::core::ffi::c_void, + gradientsDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + gradients: *mut ::core::ffi::c_void, + workSpaceSizeInBytes: usize, + workspace: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetCTCLossWorkspaceSize( + handle: cuda_types::cudnn8::cudnnHandle_t, + probsDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + gradientsDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + labels: *const ::core::ffi::c_int, + labelLengths: *const ::core::ffi::c_int, + inputLengths: *const ::core::ffi::c_int, + algo: cuda_types::cudnn8::cudnnCTCLossAlgo_t, + ctcLossDesc: cuda_types::cudnn8::cudnnCTCLossDescriptor_t, + sizeInBytes: *mut usize, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetCTCLossWorkspaceSize_v8( + handle: cuda_types::cudnn8::cudnnHandle_t, + algo: cuda_types::cudnn8::cudnnCTCLossAlgo_t, + ctcLossDesc: cuda_types::cudnn8::cudnnCTCLossDescriptor_t, + probsDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + gradientsDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + sizeInBytes: *mut usize, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnAdvTrainVersionCheck() -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn 
cudnnCreateConvolutionDescriptor( + convDesc: *mut cuda_types::cudnn8::cudnnConvolutionDescriptor_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnDestroyConvolutionDescriptor( + convDesc: cuda_types::cudnn8::cudnnConvolutionDescriptor_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnSetConvolutionMathType( + convDesc: cuda_types::cudnn8::cudnnConvolutionDescriptor_t, + mathType: cuda_types::cudnn8::cudnnMathType_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetConvolutionMathType( + convDesc: cuda_types::cudnn8::cudnnConvolutionDescriptor_t, + mathType: *mut cuda_types::cudnn8::cudnnMathType_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnSetConvolutionGroupCount( + convDesc: cuda_types::cudnn8::cudnnConvolutionDescriptor_t, + groupCount: ::core::ffi::c_int, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetConvolutionGroupCount( + convDesc: cuda_types::cudnn8::cudnnConvolutionDescriptor_t, + groupCount: *mut ::core::ffi::c_int, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnSetConvolutionReorderType( + convDesc: cuda_types::cudnn8::cudnnConvolutionDescriptor_t, + reorderType: cuda_types::cudnn8::cudnnReorderType_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetConvolutionReorderType( + convDesc: cuda_types::cudnn8::cudnnConvolutionDescriptor_t, + reorderType: *mut cuda_types::cudnn8::cudnnReorderType_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnSetConvolution2dDescriptor( + convDesc: cuda_types::cudnn8::cudnnConvolutionDescriptor_t, + pad_h: ::core::ffi::c_int, + pad_w: ::core::ffi::c_int, + u: ::core::ffi::c_int, + v: ::core::ffi::c_int, + dilation_h: ::core::ffi::c_int, + dilation_w: ::core::ffi::c_int, + mode: cuda_types::cudnn8::cudnnConvolutionMode_t, + computeType: cuda_types::cudnn8::cudnnDataType_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetConvolution2dDescriptor( + convDesc: cuda_types::cudnn8::cudnnConvolutionDescriptor_t, + pad_h: *mut ::core::ffi::c_int, + pad_w: *mut ::core::ffi::c_int, + u: *mut ::core::ffi::c_int, + v: *mut ::core::ffi::c_int, + dilation_h: *mut ::core::ffi::c_int, + dilation_w: *mut ::core::ffi::c_int, + mode: *mut cuda_types::cudnn8::cudnnConvolutionMode_t, + computeType: *mut cuda_types::cudnn8::cudnnDataType_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnSetConvolutionNdDescriptor( + convDesc: cuda_types::cudnn8::cudnnConvolutionDescriptor_t, + arrayLength: ::core::ffi::c_int, + padA: *const ::core::ffi::c_int, + filterStrideA: *const ::core::ffi::c_int, + dilationA: *const ::core::ffi::c_int, + mode: cuda_types::cudnn8::cudnnConvolutionMode_t, + computeType: cuda_types::cudnn8::cudnnDataType_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetConvolutionNdDescriptor( + convDesc: cuda_types::cudnn8::cudnnConvolutionDescriptor_t, + arrayLengthRequested: ::core::ffi::c_int, + arrayLength: *mut ::core::ffi::c_int, + padA: *mut ::core::ffi::c_int, + strideA: *mut ::core::ffi::c_int, + dilationA: *mut ::core::ffi::c_int, + mode: *mut cuda_types::cudnn8::cudnnConvolutionMode_t, + computeType: *mut cuda_types::cudnn8::cudnnDataType_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetConvolution2dForwardOutputDim( + convDesc: cuda_types::cudnn8::cudnnConvolutionDescriptor_t, + inputTensorDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + filterDesc: cuda_types::cudnn8::cudnnFilterDescriptor_t, + n: *mut 
::core::ffi::c_int, + c: *mut ::core::ffi::c_int, + h: *mut ::core::ffi::c_int, + w: *mut ::core::ffi::c_int, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetConvolutionNdForwardOutputDim( + convDesc: cuda_types::cudnn8::cudnnConvolutionDescriptor_t, + inputTensorDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + filterDesc: cuda_types::cudnn8::cudnnFilterDescriptor_t, + nbDims: ::core::ffi::c_int, + tensorOuputDimA: *mut ::core::ffi::c_int, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetConvolutionForwardAlgorithmMaxCount( + handle: cuda_types::cudnn8::cudnnHandle_t, + count: *mut ::core::ffi::c_int, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetConvolutionForwardAlgorithm_v7( + handle: cuda_types::cudnn8::cudnnHandle_t, + srcDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + filterDesc: cuda_types::cudnn8::cudnnFilterDescriptor_t, + convDesc: cuda_types::cudnn8::cudnnConvolutionDescriptor_t, + destDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + requestedAlgoCount: ::core::ffi::c_int, + returnedAlgoCount: *mut ::core::ffi::c_int, + perfResults: *mut cuda_types::cudnn8::cudnnConvolutionFwdAlgoPerf_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnFindConvolutionForwardAlgorithm( + handle: cuda_types::cudnn8::cudnnHandle_t, + xDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + wDesc: cuda_types::cudnn8::cudnnFilterDescriptor_t, + convDesc: cuda_types::cudnn8::cudnnConvolutionDescriptor_t, + yDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + requestedAlgoCount: ::core::ffi::c_int, + returnedAlgoCount: *mut ::core::ffi::c_int, + perfResults: *mut cuda_types::cudnn8::cudnnConvolutionFwdAlgoPerf_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnFindConvolutionForwardAlgorithmEx( + handle: cuda_types::cudnn8::cudnnHandle_t, + xDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + x: *const ::core::ffi::c_void, + wDesc: cuda_types::cudnn8::cudnnFilterDescriptor_t, + w: *const ::core::ffi::c_void, + convDesc: cuda_types::cudnn8::cudnnConvolutionDescriptor_t, + yDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + y: *mut ::core::ffi::c_void, + requestedAlgoCount: ::core::ffi::c_int, + returnedAlgoCount: *mut ::core::ffi::c_int, + perfResults: *mut cuda_types::cudnn8::cudnnConvolutionFwdAlgoPerf_t, + workSpace: *mut ::core::ffi::c_void, + workSpaceSizeInBytes: usize, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnIm2Col( + handle: cuda_types::cudnn8::cudnnHandle_t, + xDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + x: *const ::core::ffi::c_void, + wDesc: cuda_types::cudnn8::cudnnFilterDescriptor_t, + convDesc: cuda_types::cudnn8::cudnnConvolutionDescriptor_t, + colBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnReorderFilterAndBias( + handle: cuda_types::cudnn8::cudnnHandle_t, + filterDesc: cuda_types::cudnn8::cudnnFilterDescriptor_t, + reorderType: cuda_types::cudnn8::cudnnReorderType_t, + filterData: *const ::core::ffi::c_void, + reorderedFilterData: *mut ::core::ffi::c_void, + reorderBias: ::core::ffi::c_int, + biasData: *const ::core::ffi::c_void, + reorderedBiasData: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetConvolutionForwardWorkspaceSize( + handle: cuda_types::cudnn8::cudnnHandle_t, + xDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + wDesc: cuda_types::cudnn8::cudnnFilterDescriptor_t, + convDesc: cuda_types::cudnn8::cudnnConvolutionDescriptor_t, 
+ yDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + algo: cuda_types::cudnn8::cudnnConvolutionFwdAlgo_t, + sizeInBytes: *mut usize, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnConvolutionForward( + handle: cuda_types::cudnn8::cudnnHandle_t, + alpha: *const ::core::ffi::c_void, + xDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + x: *const ::core::ffi::c_void, + wDesc: cuda_types::cudnn8::cudnnFilterDescriptor_t, + w: *const ::core::ffi::c_void, + convDesc: cuda_types::cudnn8::cudnnConvolutionDescriptor_t, + algo: cuda_types::cudnn8::cudnnConvolutionFwdAlgo_t, + workSpace: *mut ::core::ffi::c_void, + workSpaceSizeInBytes: usize, + beta: *const ::core::ffi::c_void, + yDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + y: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnConvolutionBiasActivationForward( + handle: cuda_types::cudnn8::cudnnHandle_t, + alpha1: *const ::core::ffi::c_void, + xDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + x: *const ::core::ffi::c_void, + wDesc: cuda_types::cudnn8::cudnnFilterDescriptor_t, + w: *const ::core::ffi::c_void, + convDesc: cuda_types::cudnn8::cudnnConvolutionDescriptor_t, + algo: cuda_types::cudnn8::cudnnConvolutionFwdAlgo_t, + workSpace: *mut ::core::ffi::c_void, + workSpaceSizeInBytes: usize, + alpha2: *const ::core::ffi::c_void, + zDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + z: *const ::core::ffi::c_void, + biasDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + bias: *const ::core::ffi::c_void, + activationDesc: cuda_types::cudnn8::cudnnActivationDescriptor_t, + yDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + y: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetConvolutionBackwardDataAlgorithmMaxCount( + handle: cuda_types::cudnn8::cudnnHandle_t, + count: *mut ::core::ffi::c_int, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnFindConvolutionBackwardDataAlgorithm( + handle: cuda_types::cudnn8::cudnnHandle_t, + wDesc: cuda_types::cudnn8::cudnnFilterDescriptor_t, + dyDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + convDesc: cuda_types::cudnn8::cudnnConvolutionDescriptor_t, + dxDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + requestedAlgoCount: ::core::ffi::c_int, + returnedAlgoCount: *mut ::core::ffi::c_int, + perfResults: *mut cuda_types::cudnn8::cudnnConvolutionBwdDataAlgoPerf_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnFindConvolutionBackwardDataAlgorithmEx( + handle: cuda_types::cudnn8::cudnnHandle_t, + wDesc: cuda_types::cudnn8::cudnnFilterDescriptor_t, + w: *const ::core::ffi::c_void, + dyDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + dy: *const ::core::ffi::c_void, + convDesc: cuda_types::cudnn8::cudnnConvolutionDescriptor_t, + dxDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + dx: *mut ::core::ffi::c_void, + requestedAlgoCount: ::core::ffi::c_int, + returnedAlgoCount: *mut ::core::ffi::c_int, + perfResults: *mut cuda_types::cudnn8::cudnnConvolutionBwdDataAlgoPerf_t, + workSpace: *mut ::core::ffi::c_void, + workSpaceSizeInBytes: usize, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetConvolutionBackwardDataAlgorithm_v7( + handle: cuda_types::cudnn8::cudnnHandle_t, + filterDesc: cuda_types::cudnn8::cudnnFilterDescriptor_t, + diffDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + convDesc: cuda_types::cudnn8::cudnnConvolutionDescriptor_t, + gradDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + requestedAlgoCount: 
::core::ffi::c_int, + returnedAlgoCount: *mut ::core::ffi::c_int, + perfResults: *mut cuda_types::cudnn8::cudnnConvolutionBwdDataAlgoPerf_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetConvolutionBackwardDataWorkspaceSize( + handle: cuda_types::cudnn8::cudnnHandle_t, + wDesc: cuda_types::cudnn8::cudnnFilterDescriptor_t, + dyDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + convDesc: cuda_types::cudnn8::cudnnConvolutionDescriptor_t, + dxDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + algo: cuda_types::cudnn8::cudnnConvolutionBwdDataAlgo_t, + sizeInBytes: *mut usize, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnConvolutionBackwardData( + handle: cuda_types::cudnn8::cudnnHandle_t, + alpha: *const ::core::ffi::c_void, + wDesc: cuda_types::cudnn8::cudnnFilterDescriptor_t, + w: *const ::core::ffi::c_void, + dyDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + dy: *const ::core::ffi::c_void, + convDesc: cuda_types::cudnn8::cudnnConvolutionDescriptor_t, + algo: cuda_types::cudnn8::cudnnConvolutionBwdDataAlgo_t, + workSpace: *mut ::core::ffi::c_void, + workSpaceSizeInBytes: usize, + beta: *const ::core::ffi::c_void, + dxDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + dx: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetFoldedConvBackwardDataDescriptors( + handle: cuda_types::cudnn8::cudnnHandle_t, + filterDesc: cuda_types::cudnn8::cudnnFilterDescriptor_t, + diffDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + convDesc: cuda_types::cudnn8::cudnnConvolutionDescriptor_t, + gradDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + transformFormat: cuda_types::cudnn8::cudnnTensorFormat_t, + foldedFilterDesc: cuda_types::cudnn8::cudnnFilterDescriptor_t, + paddedDiffDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + foldedConvDesc: cuda_types::cudnn8::cudnnConvolutionDescriptor_t, + foldedGradDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + filterFoldTransDesc: cuda_types::cudnn8::cudnnTensorTransformDescriptor_t, + diffPadTransDesc: cuda_types::cudnn8::cudnnTensorTransformDescriptor_t, + gradFoldTransDesc: cuda_types::cudnn8::cudnnTensorTransformDescriptor_t, + gradUnfoldTransDesc: cuda_types::cudnn8::cudnnTensorTransformDescriptor_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnCnnInferVersionCheck() -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetConvolutionBackwardFilterAlgorithmMaxCount( + handle: cuda_types::cudnn8::cudnnHandle_t, + count: *mut ::core::ffi::c_int, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnFindConvolutionBackwardFilterAlgorithm( + handle: cuda_types::cudnn8::cudnnHandle_t, + xDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + dyDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + convDesc: cuda_types::cudnn8::cudnnConvolutionDescriptor_t, + dwDesc: cuda_types::cudnn8::cudnnFilterDescriptor_t, + requestedAlgoCount: ::core::ffi::c_int, + returnedAlgoCount: *mut ::core::ffi::c_int, + perfResults: *mut cuda_types::cudnn8::cudnnConvolutionBwdFilterAlgoPerf_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnFindConvolutionBackwardFilterAlgorithmEx( + handle: cuda_types::cudnn8::cudnnHandle_t, + xDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + x: *const ::core::ffi::c_void, + dyDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + y: *const ::core::ffi::c_void, + convDesc: cuda_types::cudnn8::cudnnConvolutionDescriptor_t, + dwDesc: cuda_types::cudnn8::cudnnFilterDescriptor_t, + dw: *mut 
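+        // Editorial note, illustrative only: unlike the heuristic
+        // cudnnGet*Algorithm_v7 queries, the cudnnFind*AlgorithmEx entry
+        // points actually execute and time each candidate algorithm, so they
+        // need live device buffers (x, dy, dw here) plus a scratch workspace,
+        // and they write up to `requestedAlgoCount` timed results into
+        // `perfResults`.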
::core::ffi::c_void, + requestedAlgoCount: ::core::ffi::c_int, + returnedAlgoCount: *mut ::core::ffi::c_int, + perfResults: *mut cuda_types::cudnn8::cudnnConvolutionBwdFilterAlgoPerf_t, + workSpace: *mut ::core::ffi::c_void, + workSpaceSizeInBytes: usize, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetConvolutionBackwardFilterAlgorithm_v7( + handle: cuda_types::cudnn8::cudnnHandle_t, + srcDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + diffDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + convDesc: cuda_types::cudnn8::cudnnConvolutionDescriptor_t, + gradDesc: cuda_types::cudnn8::cudnnFilterDescriptor_t, + requestedAlgoCount: ::core::ffi::c_int, + returnedAlgoCount: *mut ::core::ffi::c_int, + perfResults: *mut cuda_types::cudnn8::cudnnConvolutionBwdFilterAlgoPerf_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetConvolutionBackwardFilterWorkspaceSize( + handle: cuda_types::cudnn8::cudnnHandle_t, + xDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + dyDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + convDesc: cuda_types::cudnn8::cudnnConvolutionDescriptor_t, + gradDesc: cuda_types::cudnn8::cudnnFilterDescriptor_t, + algo: cuda_types::cudnn8::cudnnConvolutionBwdFilterAlgo_t, + sizeInBytes: *mut usize, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnConvolutionBackwardFilter( + handle: cuda_types::cudnn8::cudnnHandle_t, + alpha: *const ::core::ffi::c_void, + xDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + x: *const ::core::ffi::c_void, + dyDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + dy: *const ::core::ffi::c_void, + convDesc: cuda_types::cudnn8::cudnnConvolutionDescriptor_t, + algo: cuda_types::cudnn8::cudnnConvolutionBwdFilterAlgo_t, + workSpace: *mut ::core::ffi::c_void, + workSpaceSizeInBytes: usize, + beta: *const ::core::ffi::c_void, + dwDesc: cuda_types::cudnn8::cudnnFilterDescriptor_t, + dw: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnConvolutionBackwardBias( + handle: cuda_types::cudnn8::cudnnHandle_t, + alpha: *const ::core::ffi::c_void, + dyDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + dy: *const ::core::ffi::c_void, + beta: *const ::core::ffi::c_void, + dbDesc: cuda_types::cudnn8::cudnnTensorDescriptor_t, + db: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnCreateFusedOpsConstParamPack( + constPack: *mut cuda_types::cudnn8::cudnnFusedOpsConstParamPack_t, + ops: cuda_types::cudnn8::cudnnFusedOps_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnDestroyFusedOpsConstParamPack( + constPack: cuda_types::cudnn8::cudnnFusedOpsConstParamPack_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnSetFusedOpsConstParamPackAttribute( + constPack: cuda_types::cudnn8::cudnnFusedOpsConstParamPack_t, + paramLabel: cuda_types::cudnn8::cudnnFusedOpsConstParamLabel_t, + param: *const ::core::ffi::c_void, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetFusedOpsConstParamPackAttribute( + constPack: cuda_types::cudnn8::cudnnFusedOpsConstParamPack_t, + paramLabel: cuda_types::cudnn8::cudnnFusedOpsConstParamLabel_t, + param: *mut ::core::ffi::c_void, + isNULL: *mut ::core::ffi::c_int, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnCreateFusedOpsVariantParamPack( + varPack: *mut cuda_types::cudnn8::cudnnFusedOpsVariantParamPack_t, + ops: cuda_types::cudnn8::cudnnFusedOps_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn 
cudnnDestroyFusedOpsVariantParamPack( + varPack: cuda_types::cudnn8::cudnnFusedOpsVariantParamPack_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnSetFusedOpsVariantParamPackAttribute( + varPack: cuda_types::cudnn8::cudnnFusedOpsVariantParamPack_t, + paramLabel: cuda_types::cudnn8::cudnnFusedOpsVariantParamLabel_t, + ptr: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnGetFusedOpsVariantParamPackAttribute( + varPack: cuda_types::cudnn8::cudnnFusedOpsVariantParamPack_t, + paramLabel: cuda_types::cudnn8::cudnnFusedOpsVariantParamLabel_t, + ptr: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnCreateFusedOpsPlan( + plan: *mut cuda_types::cudnn8::cudnnFusedOpsPlan_t, + ops: cuda_types::cudnn8::cudnnFusedOps_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnDestroyFusedOpsPlan( + plan: cuda_types::cudnn8::cudnnFusedOpsPlan_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnMakeFusedOpsPlan( + handle: cuda_types::cudnn8::cudnnHandle_t, + plan: cuda_types::cudnn8::cudnnFusedOpsPlan_t, + constPack: cuda_types::cudnn8::cudnnFusedOpsConstParamPack_t, + workspaceSizeInBytes: *mut usize, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnFusedOpsExecute( + handle: cuda_types::cudnn8::cudnnHandle_t, + plan: cuda_types::cudnn8::cudnnFusedOpsPlan_t, + varPack: cuda_types::cudnn8::cudnnFusedOpsVariantParamPack_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnCnnTrainVersionCheck() -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnBackendCreateDescriptor( + descriptorType: cuda_types::cudnn8::cudnnBackendDescriptorType_t, + descriptor: *mut cuda_types::cudnn8::cudnnBackendDescriptor_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnBackendDestroyDescriptor( + descriptor: cuda_types::cudnn8::cudnnBackendDescriptor_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnBackendInitialize( + descriptor: cuda_types::cudnn8::cudnnBackendDescriptor_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnBackendFinalize( + descriptor: cuda_types::cudnn8::cudnnBackendDescriptor_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnBackendSetAttribute( + descriptor: cuda_types::cudnn8::cudnnBackendDescriptor_t, + attributeName: cuda_types::cudnn8::cudnnBackendAttributeName_t, + attributeType: cuda_types::cudnn8::cudnnBackendAttributeType_t, + elementCount: i64, + arrayOfElements: *const ::core::ffi::c_void, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnBackendGetAttribute( + descriptor: cuda_types::cudnn8::cudnnBackendDescriptor_t, + attributeName: cuda_types::cudnn8::cudnnBackendAttributeName_t, + attributeType: cuda_types::cudnn8::cudnnBackendAttributeType_t, + requestedElementCount: i64, + elementCount: *mut i64, + arrayOfElements: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn8::cudnnStatus_t; + #[must_use] + fn cudnnBackendExecute( + handle: cuda_types::cudnn8::cudnnHandle_t, + executionPlan: cuda_types::cudnn8::cudnnBackendDescriptor_t, + variantPack: cuda_types::cudnn8::cudnnBackendDescriptor_t, + ) -> cuda_types::cudnn8::cudnnStatus_t; +} diff --git a/cuda_base/src/cudnn9.rs b/cuda_base/src/cudnn9.rs new file mode 100644 index 0000000..b2c4b07 --- /dev/null +++ b/cuda_base/src/cudnn9.rs @@ -0,0 +1,2055 @@ +// Generated automatically by zluda_bindgen +// DO NOT EDIT MANUALLY +#![allow(warnings)] +extern "system" { + fn cudnnGetVersion() 
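+    // Editorial note, illustrative only: the version/utility entry points in
+    // this module return plain values (usize or a C string) rather than
+    // cudnnStatus_t, which is why zluda_bindgen does not mark them
+    // #[must_use] like the status-returning declarations below.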
-> usize; + fn cudnnGetMaxDeviceVersion() -> usize; + fn cudnnGetCudartVersion() -> usize; + fn cudnnGetErrorString( + status: cuda_types::cudnn9::cudnnStatus_t, + ) -> *const ::core::ffi::c_char; + fn cudnnGetLastErrorString(message: *mut ::core::ffi::c_char, max_size: usize) -> (); + #[must_use] + fn cudnnQueryRuntimeError( + handle: cuda_types::cudnn9::cudnnHandle_t, + rstatus: *mut cuda_types::cudnn9::cudnnStatus_t, + mode: cuda_types::cudnn9::cudnnErrQueryMode_t, + tag: *mut cuda_types::cudnn9::cudnnRuntimeTag_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetProperty( + type_: cuda_types::cudnn9::libraryPropertyType, + value: *mut ::core::ffi::c_int, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnCreate( + handle: *mut cuda_types::cudnn9::cudnnHandle_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnDestroy( + handle: cuda_types::cudnn9::cudnnHandle_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnSetStream( + handle: cuda_types::cudnn9::cudnnHandle_t, + streamId: cuda_types::cudnn9::cudaStream_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetStream( + handle: cuda_types::cudnn9::cudnnHandle_t, + streamId: *mut cuda_types::cudnn9::cudaStream_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnSetCallback( + mask: ::core::ffi::c_uint, + udata: *mut ::core::ffi::c_void, + fptr: cuda_types::cudnn9::cudnnCallback_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetCallback( + mask: *mut ::core::ffi::c_uint, + udata: *mut *mut ::core::ffi::c_void, + fptr: *mut cuda_types::cudnn9::cudnnCallback_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGraphVersionCheck() -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnBackendCreateDescriptor( + descriptorType: cuda_types::cudnn9::cudnnBackendDescriptorType_t, + descriptor: *mut cuda_types::cudnn9::cudnnBackendDescriptor_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnBackendDestroyDescriptor( + descriptor: cuda_types::cudnn9::cudnnBackendDescriptor_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnBackendInitialize( + descriptor: cuda_types::cudnn9::cudnnBackendDescriptor_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnBackendFinalize( + descriptor: cuda_types::cudnn9::cudnnBackendDescriptor_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnBackendSetAttribute( + descriptor: cuda_types::cudnn9::cudnnBackendDescriptor_t, + attributeName: cuda_types::cudnn9::cudnnBackendAttributeName_t, + attributeType: cuda_types::cudnn9::cudnnBackendAttributeType_t, + elementCount: i64, + arrayOfElements: *const ::core::ffi::c_void, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnBackendGetAttribute( + descriptor: cuda_types::cudnn9::cudnnBackendDescriptor_t, + attributeName: cuda_types::cudnn9::cudnnBackendAttributeName_t, + attributeType: cuda_types::cudnn9::cudnnBackendAttributeType_t, + requestedElementCount: i64, + elementCount: *mut i64, + arrayOfElements: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnBackendExecute( + handle: cuda_types::cudnn9::cudnnHandle_t, + executionPlan: cuda_types::cudnn9::cudnnBackendDescriptor_t, + variantPack: cuda_types::cudnn9::cudnnBackendDescriptor_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnBackendPopulateCudaGraph( + handle: cuda_types::cudnn9::cudnnHandle_t, + executionPlan: 
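+    // Editorial note, illustrative only: cudnnBackendPopulateCudaGraph and
+    // cudnnBackendUpdateCudaGraph are cuDNN 9 additions with no counterpart
+    // in the cudnn8 module above; they record a finalized backend execution
+    // plan plus a variant pack into an existing cudaGraph_t instead of
+    // executing it immediately via cudnnBackendExecute.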
cuda_types::cudnn9::cudnnBackendDescriptor_t, + variantPack: cuda_types::cudnn9::cudnnBackendDescriptor_t, + graph: cuda_types::cudnn9::cudaGraph_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnBackendUpdateCudaGraph( + handle: cuda_types::cudnn9::cudnnHandle_t, + executionPlan: cuda_types::cudnn9::cudnnBackendDescriptor_t, + variantPack: cuda_types::cudnn9::cudnnBackendDescriptor_t, + graph: cuda_types::cudnn9::cudaGraph_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnCreateTensorDescriptor( + tensorDesc: *mut cuda_types::cudnn9::cudnnTensorDescriptor_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnSetTensor4dDescriptor( + tensorDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + format: cuda_types::cudnn9::cudnnTensorFormat_t, + dataType: cuda_types::cudnn9::cudnnDataType_t, + n: ::core::ffi::c_int, + c: ::core::ffi::c_int, + h: ::core::ffi::c_int, + w: ::core::ffi::c_int, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnSetTensor4dDescriptorEx( + tensorDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + dataType: cuda_types::cudnn9::cudnnDataType_t, + n: ::core::ffi::c_int, + c: ::core::ffi::c_int, + h: ::core::ffi::c_int, + w: ::core::ffi::c_int, + nStride: ::core::ffi::c_int, + cStride: ::core::ffi::c_int, + hStride: ::core::ffi::c_int, + wStride: ::core::ffi::c_int, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetTensor4dDescriptor( + tensorDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + dataType: *mut cuda_types::cudnn9::cudnnDataType_t, + n: *mut ::core::ffi::c_int, + c: *mut ::core::ffi::c_int, + h: *mut ::core::ffi::c_int, + w: *mut ::core::ffi::c_int, + nStride: *mut ::core::ffi::c_int, + cStride: *mut ::core::ffi::c_int, + hStride: *mut ::core::ffi::c_int, + wStride: *mut ::core::ffi::c_int, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnSetTensorNdDescriptor( + tensorDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + dataType: cuda_types::cudnn9::cudnnDataType_t, + nbDims: ::core::ffi::c_int, + dimA: *const ::core::ffi::c_int, + strideA: *const ::core::ffi::c_int, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnSetTensorNdDescriptorEx( + tensorDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + format: cuda_types::cudnn9::cudnnTensorFormat_t, + dataType: cuda_types::cudnn9::cudnnDataType_t, + nbDims: ::core::ffi::c_int, + dimA: *const ::core::ffi::c_int, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetTensorNdDescriptor( + tensorDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + nbDimsRequested: ::core::ffi::c_int, + dataType: *mut cuda_types::cudnn9::cudnnDataType_t, + nbDims: *mut ::core::ffi::c_int, + dimA: *mut ::core::ffi::c_int, + strideA: *mut ::core::ffi::c_int, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetTensorSizeInBytes( + tensorDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + size: *mut usize, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnDestroyTensorDescriptor( + tensorDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnInitTransformDest( + transformDesc: cuda_types::cudnn9::cudnnTensorTransformDescriptor_t, + srcDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + destDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + destSizeInBytes: *mut usize, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnCreateTensorTransformDescriptor( + transformDesc: *mut 
cuda_types::cudnn9::cudnnTensorTransformDescriptor_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnSetTensorTransformDescriptor( + transformDesc: cuda_types::cudnn9::cudnnTensorTransformDescriptor_t, + nbDims: u32, + destFormat: cuda_types::cudnn9::cudnnTensorFormat_t, + padBeforeA: *const i32, + padAfterA: *const i32, + foldA: *const u32, + direction: cuda_types::cudnn9::cudnnFoldingDirection_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetTensorTransformDescriptor( + transformDesc: cuda_types::cudnn9::cudnnTensorTransformDescriptor_t, + nbDimsRequested: u32, + destFormat: *mut cuda_types::cudnn9::cudnnTensorFormat_t, + padBeforeA: *mut i32, + padAfterA: *mut i32, + foldA: *mut u32, + direction: *mut cuda_types::cudnn9::cudnnFoldingDirection_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnDestroyTensorTransformDescriptor( + transformDesc: cuda_types::cudnn9::cudnnTensorTransformDescriptor_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnTransformTensor( + handle: cuda_types::cudnn9::cudnnHandle_t, + alpha: *const ::core::ffi::c_void, + xDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + x: *const ::core::ffi::c_void, + beta: *const ::core::ffi::c_void, + yDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + y: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnTransformTensorEx( + handle: cuda_types::cudnn9::cudnnHandle_t, + transDesc: cuda_types::cudnn9::cudnnTensorTransformDescriptor_t, + alpha: *const ::core::ffi::c_void, + srcDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + srcData: *const ::core::ffi::c_void, + beta: *const ::core::ffi::c_void, + destDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + destData: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnAddTensor( + handle: cuda_types::cudnn9::cudnnHandle_t, + alpha: *const ::core::ffi::c_void, + aDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + A: *const ::core::ffi::c_void, + beta: *const ::core::ffi::c_void, + cDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + C: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnCreateOpTensorDescriptor( + opTensorDesc: *mut cuda_types::cudnn9::cudnnOpTensorDescriptor_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnSetOpTensorDescriptor( + opTensorDesc: cuda_types::cudnn9::cudnnOpTensorDescriptor_t, + opTensorOp: cuda_types::cudnn9::cudnnOpTensorOp_t, + opTensorCompType: cuda_types::cudnn9::cudnnDataType_t, + opTensorNanOpt: cuda_types::cudnn9::cudnnNanPropagation_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetOpTensorDescriptor( + opTensorDesc: cuda_types::cudnn9::cudnnOpTensorDescriptor_t, + opTensorOp: *mut cuda_types::cudnn9::cudnnOpTensorOp_t, + opTensorCompType: *mut cuda_types::cudnn9::cudnnDataType_t, + opTensorNanOpt: *mut cuda_types::cudnn9::cudnnNanPropagation_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnDestroyOpTensorDescriptor( + opTensorDesc: cuda_types::cudnn9::cudnnOpTensorDescriptor_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnOpTensor( + handle: cuda_types::cudnn9::cudnnHandle_t, + opTensorDesc: cuda_types::cudnn9::cudnnOpTensorDescriptor_t, + alpha1: *const ::core::ffi::c_void, + aDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + A: *const ::core::ffi::c_void, + alpha2: *const ::core::ffi::c_void, + bDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + B: 
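+        // Editorial note, illustrative only: the descriptor APIs in this
+        // module follow a uniform Create/Set/Get/Destroy lifecycle. A hedged
+        // sketch, with `check` a hypothetical cudnnStatus_t-to-Result helper:
+        //   let mut desc = std::ptr::null_mut();
+        //   check(cudnnCreateOpTensorDescriptor(&mut desc))?;
+        //   check(cudnnSetOpTensorDescriptor(desc, op, comp_type, nan_opt))?;
+        //   // ...use `desc` with cudnnOpTensor...
+        //   check(cudnnDestroyOpTensorDescriptor(desc))?;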
*const ::core::ffi::c_void, + beta: *const ::core::ffi::c_void, + cDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + C: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnCreateReduceTensorDescriptor( + reduceTensorDesc: *mut cuda_types::cudnn9::cudnnReduceTensorDescriptor_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnSetReduceTensorDescriptor( + reduceTensorDesc: cuda_types::cudnn9::cudnnReduceTensorDescriptor_t, + reduceTensorOp: cuda_types::cudnn9::cudnnReduceTensorOp_t, + reduceTensorCompType: cuda_types::cudnn9::cudnnDataType_t, + reduceTensorNanOpt: cuda_types::cudnn9::cudnnNanPropagation_t, + reduceTensorIndices: cuda_types::cudnn9::cudnnReduceTensorIndices_t, + reduceTensorIndicesType: cuda_types::cudnn9::cudnnIndicesType_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetReduceTensorDescriptor( + reduceTensorDesc: cuda_types::cudnn9::cudnnReduceTensorDescriptor_t, + reduceTensorOp: *mut cuda_types::cudnn9::cudnnReduceTensorOp_t, + reduceTensorCompType: *mut cuda_types::cudnn9::cudnnDataType_t, + reduceTensorNanOpt: *mut cuda_types::cudnn9::cudnnNanPropagation_t, + reduceTensorIndices: *mut cuda_types::cudnn9::cudnnReduceTensorIndices_t, + reduceTensorIndicesType: *mut cuda_types::cudnn9::cudnnIndicesType_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnDestroyReduceTensorDescriptor( + reduceTensorDesc: cuda_types::cudnn9::cudnnReduceTensorDescriptor_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetReductionIndicesSize( + handle: cuda_types::cudnn9::cudnnHandle_t, + reduceTensorDesc: cuda_types::cudnn9::cudnnReduceTensorDescriptor_t, + aDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + cDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + sizeInBytes: *mut usize, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetReductionWorkspaceSize( + handle: cuda_types::cudnn9::cudnnHandle_t, + reduceTensorDesc: cuda_types::cudnn9::cudnnReduceTensorDescriptor_t, + aDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + cDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + sizeInBytes: *mut usize, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnReduceTensor( + handle: cuda_types::cudnn9::cudnnHandle_t, + reduceTensorDesc: cuda_types::cudnn9::cudnnReduceTensorDescriptor_t, + indices: *mut ::core::ffi::c_void, + indicesSizeInBytes: usize, + workspace: *mut ::core::ffi::c_void, + workspaceSizeInBytes: usize, + alpha: *const ::core::ffi::c_void, + aDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + A: *const ::core::ffi::c_void, + beta: *const ::core::ffi::c_void, + cDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + C: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnSetTensor( + handle: cuda_types::cudnn9::cudnnHandle_t, + yDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + y: *mut ::core::ffi::c_void, + valuePtr: *const ::core::ffi::c_void, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnScaleTensor( + handle: cuda_types::cudnn9::cudnnHandle_t, + yDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + y: *mut ::core::ffi::c_void, + alpha: *const ::core::ffi::c_void, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnCreateFilterDescriptor( + filterDesc: *mut cuda_types::cudnn9::cudnnFilterDescriptor_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnSetFilter4dDescriptor( + filterDesc: cuda_types::cudnn9::cudnnFilterDescriptor_t, + 
dataType: cuda_types::cudnn9::cudnnDataType_t, + format: cuda_types::cudnn9::cudnnTensorFormat_t, + k: ::core::ffi::c_int, + c: ::core::ffi::c_int, + h: ::core::ffi::c_int, + w: ::core::ffi::c_int, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetFilter4dDescriptor( + filterDesc: cuda_types::cudnn9::cudnnFilterDescriptor_t, + dataType: *mut cuda_types::cudnn9::cudnnDataType_t, + format: *mut cuda_types::cudnn9::cudnnTensorFormat_t, + k: *mut ::core::ffi::c_int, + c: *mut ::core::ffi::c_int, + h: *mut ::core::ffi::c_int, + w: *mut ::core::ffi::c_int, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnSetFilterNdDescriptor( + filterDesc: cuda_types::cudnn9::cudnnFilterDescriptor_t, + dataType: cuda_types::cudnn9::cudnnDataType_t, + format: cuda_types::cudnn9::cudnnTensorFormat_t, + nbDims: ::core::ffi::c_int, + filterDimA: *const ::core::ffi::c_int, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetFilterNdDescriptor( + filterDesc: cuda_types::cudnn9::cudnnFilterDescriptor_t, + nbDimsRequested: ::core::ffi::c_int, + dataType: *mut cuda_types::cudnn9::cudnnDataType_t, + format: *mut cuda_types::cudnn9::cudnnTensorFormat_t, + nbDims: *mut ::core::ffi::c_int, + filterDimA: *mut ::core::ffi::c_int, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetFilterSizeInBytes( + filterDesc: cuda_types::cudnn9::cudnnFilterDescriptor_t, + size: *mut usize, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnTransformFilter( + handle: cuda_types::cudnn9::cudnnHandle_t, + transDesc: cuda_types::cudnn9::cudnnTensorTransformDescriptor_t, + alpha: *const ::core::ffi::c_void, + srcDesc: cuda_types::cudnn9::cudnnFilterDescriptor_t, + srcData: *const ::core::ffi::c_void, + beta: *const ::core::ffi::c_void, + destDesc: cuda_types::cudnn9::cudnnFilterDescriptor_t, + destData: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnDestroyFilterDescriptor( + filterDesc: cuda_types::cudnn9::cudnnFilterDescriptor_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnSoftmaxForward( + handle: cuda_types::cudnn9::cudnnHandle_t, + algo: cuda_types::cudnn9::cudnnSoftmaxAlgorithm_t, + mode: cuda_types::cudnn9::cudnnSoftmaxMode_t, + alpha: *const ::core::ffi::c_void, + xDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + x: *const ::core::ffi::c_void, + beta: *const ::core::ffi::c_void, + yDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + y: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnCreatePoolingDescriptor( + poolingDesc: *mut cuda_types::cudnn9::cudnnPoolingDescriptor_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnSetPooling2dDescriptor( + poolingDesc: cuda_types::cudnn9::cudnnPoolingDescriptor_t, + mode: cuda_types::cudnn9::cudnnPoolingMode_t, + maxpoolingNanOpt: cuda_types::cudnn9::cudnnNanPropagation_t, + windowHeight: ::core::ffi::c_int, + windowWidth: ::core::ffi::c_int, + verticalPadding: ::core::ffi::c_int, + horizontalPadding: ::core::ffi::c_int, + verticalStride: ::core::ffi::c_int, + horizontalStride: ::core::ffi::c_int, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetPooling2dDescriptor( + poolingDesc: cuda_types::cudnn9::cudnnPoolingDescriptor_t, + mode: *mut cuda_types::cudnn9::cudnnPoolingMode_t, + maxpoolingNanOpt: *mut cuda_types::cudnn9::cudnnNanPropagation_t, + windowHeight: *mut ::core::ffi::c_int, + windowWidth: *mut ::core::ffi::c_int, + verticalPadding: *mut 
::core::ffi::c_int, + horizontalPadding: *mut ::core::ffi::c_int, + verticalStride: *mut ::core::ffi::c_int, + horizontalStride: *mut ::core::ffi::c_int, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnSetPoolingNdDescriptor( + poolingDesc: cuda_types::cudnn9::cudnnPoolingDescriptor_t, + mode: cuda_types::cudnn9::cudnnPoolingMode_t, + maxpoolingNanOpt: cuda_types::cudnn9::cudnnNanPropagation_t, + nbDims: ::core::ffi::c_int, + windowDimA: *const ::core::ffi::c_int, + paddingA: *const ::core::ffi::c_int, + strideA: *const ::core::ffi::c_int, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetPoolingNdDescriptor( + poolingDesc: cuda_types::cudnn9::cudnnPoolingDescriptor_t, + nbDimsRequested: ::core::ffi::c_int, + mode: *mut cuda_types::cudnn9::cudnnPoolingMode_t, + maxpoolingNanOpt: *mut cuda_types::cudnn9::cudnnNanPropagation_t, + nbDims: *mut ::core::ffi::c_int, + windowDimA: *mut ::core::ffi::c_int, + paddingA: *mut ::core::ffi::c_int, + strideA: *mut ::core::ffi::c_int, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetPoolingNdForwardOutputDim( + poolingDesc: cuda_types::cudnn9::cudnnPoolingDescriptor_t, + inputTensorDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + nbDims: ::core::ffi::c_int, + outputTensorDimA: *mut ::core::ffi::c_int, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetPooling2dForwardOutputDim( + poolingDesc: cuda_types::cudnn9::cudnnPoolingDescriptor_t, + inputTensorDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + n: *mut ::core::ffi::c_int, + c: *mut ::core::ffi::c_int, + h: *mut ::core::ffi::c_int, + w: *mut ::core::ffi::c_int, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnDestroyPoolingDescriptor( + poolingDesc: cuda_types::cudnn9::cudnnPoolingDescriptor_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnPoolingForward( + handle: cuda_types::cudnn9::cudnnHandle_t, + poolingDesc: cuda_types::cudnn9::cudnnPoolingDescriptor_t, + alpha: *const ::core::ffi::c_void, + xDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + x: *const ::core::ffi::c_void, + beta: *const ::core::ffi::c_void, + yDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + y: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnCreateActivationDescriptor( + activationDesc: *mut cuda_types::cudnn9::cudnnActivationDescriptor_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnSetActivationDescriptor( + activationDesc: cuda_types::cudnn9::cudnnActivationDescriptor_t, + mode: cuda_types::cudnn9::cudnnActivationMode_t, + reluNanOpt: cuda_types::cudnn9::cudnnNanPropagation_t, + coef: f64, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetActivationDescriptor( + activationDesc: cuda_types::cudnn9::cudnnActivationDescriptor_t, + mode: *mut cuda_types::cudnn9::cudnnActivationMode_t, + reluNanOpt: *mut cuda_types::cudnn9::cudnnNanPropagation_t, + coef: *mut f64, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnSetActivationDescriptorSwishBeta( + activationDesc: cuda_types::cudnn9::cudnnActivationDescriptor_t, + swish_beta: f64, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetActivationDescriptorSwishBeta( + activationDesc: cuda_types::cudnn9::cudnnActivationDescriptor_t, + swish_beta: *mut f64, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnDestroyActivationDescriptor( + activationDesc: cuda_types::cudnn9::cudnnActivationDescriptor_t, + ) -> 
cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnActivationForward( + handle: cuda_types::cudnn9::cudnnHandle_t, + activationDesc: cuda_types::cudnn9::cudnnActivationDescriptor_t, + alpha: *const ::core::ffi::c_void, + xDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + x: *const ::core::ffi::c_void, + beta: *const ::core::ffi::c_void, + yDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + y: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnCreateLRNDescriptor( + normDesc: *mut cuda_types::cudnn9::cudnnLRNDescriptor_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnSetLRNDescriptor( + normDesc: cuda_types::cudnn9::cudnnLRNDescriptor_t, + lrnN: ::core::ffi::c_uint, + lrnAlpha: f64, + lrnBeta: f64, + lrnK: f64, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetLRNDescriptor( + normDesc: cuda_types::cudnn9::cudnnLRNDescriptor_t, + lrnN: *mut ::core::ffi::c_uint, + lrnAlpha: *mut f64, + lrnBeta: *mut f64, + lrnK: *mut f64, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnDestroyLRNDescriptor( + lrnDesc: cuda_types::cudnn9::cudnnLRNDescriptor_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnLRNCrossChannelForward( + handle: cuda_types::cudnn9::cudnnHandle_t, + normDesc: cuda_types::cudnn9::cudnnLRNDescriptor_t, + lrnMode: cuda_types::cudnn9::cudnnLRNMode_t, + alpha: *const ::core::ffi::c_void, + xDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + x: *const ::core::ffi::c_void, + beta: *const ::core::ffi::c_void, + yDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + y: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnDivisiveNormalizationForward( + handle: cuda_types::cudnn9::cudnnHandle_t, + normDesc: cuda_types::cudnn9::cudnnLRNDescriptor_t, + mode: cuda_types::cudnn9::cudnnDivNormMode_t, + alpha: *const ::core::ffi::c_void, + xDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + x: *const ::core::ffi::c_void, + means: *const ::core::ffi::c_void, + temp: *mut ::core::ffi::c_void, + temp2: *mut ::core::ffi::c_void, + beta: *const ::core::ffi::c_void, + yDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + y: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnDeriveBNTensorDescriptor( + derivedBnDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + xDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + mode: cuda_types::cudnn9::cudnnBatchNormMode_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnBatchNormalizationForwardInference( + handle: cuda_types::cudnn9::cudnnHandle_t, + mode: cuda_types::cudnn9::cudnnBatchNormMode_t, + alpha: *const ::core::ffi::c_void, + beta: *const ::core::ffi::c_void, + xDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + x: *const ::core::ffi::c_void, + yDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + y: *mut ::core::ffi::c_void, + bnScaleBiasMeanVarDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + bnScale: *const ::core::ffi::c_void, + bnBias: *const ::core::ffi::c_void, + estimatedMean: *const ::core::ffi::c_void, + estimatedVariance: *const ::core::ffi::c_void, + epsilon: f64, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnDeriveNormTensorDescriptor( + derivedNormScaleBiasDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + derivedNormMeanVarDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + xDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + mode: cuda_types::cudnn9::cudnnNormMode_t, + groupCnt: 
::core::ffi::c_int, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnNormalizationForwardInference( + handle: cuda_types::cudnn9::cudnnHandle_t, + mode: cuda_types::cudnn9::cudnnNormMode_t, + normOps: cuda_types::cudnn9::cudnnNormOps_t, + algo: cuda_types::cudnn9::cudnnNormAlgo_t, + alpha: *const ::core::ffi::c_void, + beta: *const ::core::ffi::c_void, + xDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + x: *const ::core::ffi::c_void, + normScaleBiasDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + normScale: *const ::core::ffi::c_void, + normBias: *const ::core::ffi::c_void, + normMeanVarDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + estimatedMean: *const ::core::ffi::c_void, + estimatedVariance: *const ::core::ffi::c_void, + zDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + z: *const ::core::ffi::c_void, + activationDesc: cuda_types::cudnn9::cudnnActivationDescriptor_t, + yDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + y: *mut ::core::ffi::c_void, + epsilon: f64, + groupCnt: ::core::ffi::c_int, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnCreateSpatialTransformerDescriptor( + stDesc: *mut cuda_types::cudnn9::cudnnSpatialTransformerDescriptor_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnSetSpatialTransformerNdDescriptor( + stDesc: cuda_types::cudnn9::cudnnSpatialTransformerDescriptor_t, + samplerType: cuda_types::cudnn9::cudnnSamplerType_t, + dataType: cuda_types::cudnn9::cudnnDataType_t, + nbDims: ::core::ffi::c_int, + dimA: *const ::core::ffi::c_int, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnDestroySpatialTransformerDescriptor( + stDesc: cuda_types::cudnn9::cudnnSpatialTransformerDescriptor_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnSpatialTfGridGeneratorForward( + handle: cuda_types::cudnn9::cudnnHandle_t, + stDesc: cuda_types::cudnn9::cudnnSpatialTransformerDescriptor_t, + theta: *const ::core::ffi::c_void, + grid: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnSpatialTfSamplerForward( + handle: cuda_types::cudnn9::cudnnHandle_t, + stDesc: cuda_types::cudnn9::cudnnSpatialTransformerDescriptor_t, + alpha: *const ::core::ffi::c_void, + xDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + x: *const ::core::ffi::c_void, + grid: *const ::core::ffi::c_void, + beta: *const ::core::ffi::c_void, + yDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + y: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnCreateDropoutDescriptor( + dropoutDesc: *mut cuda_types::cudnn9::cudnnDropoutDescriptor_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnDestroyDropoutDescriptor( + dropoutDesc: cuda_types::cudnn9::cudnnDropoutDescriptor_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnDropoutGetStatesSize( + handle: cuda_types::cudnn9::cudnnHandle_t, + sizeInBytes: *mut usize, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnDropoutGetReserveSpaceSize( + xdesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + sizeInBytes: *mut usize, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnSetDropoutDescriptor( + dropoutDesc: cuda_types::cudnn9::cudnnDropoutDescriptor_t, + handle: cuda_types::cudnn9::cudnnHandle_t, + dropout: f32, + states: *mut ::core::ffi::c_void, + stateSizeInBytes: usize, + seed: ::core::ffi::c_ulonglong, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnRestoreDropoutDescriptor( + dropoutDesc: 
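+        // Editorial note, illustrative only: the dropout `states` buffer is
+        // caller-allocated; its required size comes from
+        // cudnnDropoutGetStatesSize, after which cudnnSetDropoutDescriptor
+        // seeds the RNG state in that buffer. cudnnRestoreDropoutDescriptor
+        // reattaches a previously initialized buffer without regenerating it.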
cuda_types::cudnn9::cudnnDropoutDescriptor_t, + handle: cuda_types::cudnn9::cudnnHandle_t, + dropout: f32, + states: *mut ::core::ffi::c_void, + stateSizeInBytes: usize, + seed: ::core::ffi::c_ulonglong, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetDropoutDescriptor( + dropoutDesc: cuda_types::cudnn9::cudnnDropoutDescriptor_t, + handle: cuda_types::cudnn9::cudnnHandle_t, + dropout: *mut f32, + states: *mut *mut ::core::ffi::c_void, + seed: *mut ::core::ffi::c_ulonglong, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnDropoutForward( + handle: cuda_types::cudnn9::cudnnHandle_t, + dropoutDesc: cuda_types::cudnn9::cudnnDropoutDescriptor_t, + xdesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + x: *const ::core::ffi::c_void, + ydesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + y: *mut ::core::ffi::c_void, + reserveSpace: *mut ::core::ffi::c_void, + reserveSpaceSizeInBytes: usize, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnOpsVersionCheck() -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnSoftmaxBackward( + handle: cuda_types::cudnn9::cudnnHandle_t, + algo: cuda_types::cudnn9::cudnnSoftmaxAlgorithm_t, + mode: cuda_types::cudnn9::cudnnSoftmaxMode_t, + alpha: *const ::core::ffi::c_void, + yDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + y: *const ::core::ffi::c_void, + dyDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + dy: *const ::core::ffi::c_void, + beta: *const ::core::ffi::c_void, + dxDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + dx: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnPoolingBackward( + handle: cuda_types::cudnn9::cudnnHandle_t, + poolingDesc: cuda_types::cudnn9::cudnnPoolingDescriptor_t, + alpha: *const ::core::ffi::c_void, + yDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + y: *const ::core::ffi::c_void, + dyDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + dy: *const ::core::ffi::c_void, + xDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + x: *const ::core::ffi::c_void, + beta: *const ::core::ffi::c_void, + dxDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + dx: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnActivationBackward( + handle: cuda_types::cudnn9::cudnnHandle_t, + activationDesc: cuda_types::cudnn9::cudnnActivationDescriptor_t, + alpha: *const ::core::ffi::c_void, + yDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + y: *const ::core::ffi::c_void, + dyDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + dy: *const ::core::ffi::c_void, + xDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + x: *const ::core::ffi::c_void, + beta: *const ::core::ffi::c_void, + dxDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + dx: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnLRNCrossChannelBackward( + handle: cuda_types::cudnn9::cudnnHandle_t, + normDesc: cuda_types::cudnn9::cudnnLRNDescriptor_t, + lrnMode: cuda_types::cudnn9::cudnnLRNMode_t, + alpha: *const ::core::ffi::c_void, + yDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + y: *const ::core::ffi::c_void, + dyDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + dy: *const ::core::ffi::c_void, + xDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + x: *const ::core::ffi::c_void, + beta: *const ::core::ffi::c_void, + dxDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + dx: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn 
cudnnDivisiveNormalizationBackward( + handle: cuda_types::cudnn9::cudnnHandle_t, + normDesc: cuda_types::cudnn9::cudnnLRNDescriptor_t, + mode: cuda_types::cudnn9::cudnnDivNormMode_t, + alpha: *const ::core::ffi::c_void, + xDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + x: *const ::core::ffi::c_void, + means: *const ::core::ffi::c_void, + dy: *const ::core::ffi::c_void, + temp: *mut ::core::ffi::c_void, + temp2: *mut ::core::ffi::c_void, + beta: *const ::core::ffi::c_void, + dXdMeansDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + dx: *mut ::core::ffi::c_void, + dMeans: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( + handle: cuda_types::cudnn9::cudnnHandle_t, + mode: cuda_types::cudnn9::cudnnBatchNormMode_t, + bnOps: cuda_types::cudnn9::cudnnBatchNormOps_t, + xDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + zDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + yDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + bnScaleBiasMeanVarDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + activationDesc: cuda_types::cudnn9::cudnnActivationDescriptor_t, + sizeInBytes: *mut usize, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetBatchNormalizationBackwardExWorkspaceSize( + handle: cuda_types::cudnn9::cudnnHandle_t, + mode: cuda_types::cudnn9::cudnnBatchNormMode_t, + bnOps: cuda_types::cudnn9::cudnnBatchNormOps_t, + xDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + yDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + dyDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + dzDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + dxDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + dBnScaleBiasDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + activationDesc: cuda_types::cudnn9::cudnnActivationDescriptor_t, + sizeInBytes: *mut usize, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetBatchNormalizationTrainingExReserveSpaceSize( + handle: cuda_types::cudnn9::cudnnHandle_t, + mode: cuda_types::cudnn9::cudnnBatchNormMode_t, + bnOps: cuda_types::cudnn9::cudnnBatchNormOps_t, + activationDesc: cuda_types::cudnn9::cudnnActivationDescriptor_t, + xDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + sizeInBytes: *mut usize, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnBatchNormalizationForwardTraining( + handle: cuda_types::cudnn9::cudnnHandle_t, + mode: cuda_types::cudnn9::cudnnBatchNormMode_t, + alpha: *const ::core::ffi::c_void, + beta: *const ::core::ffi::c_void, + xDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + x: *const ::core::ffi::c_void, + yDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + y: *mut ::core::ffi::c_void, + bnScaleBiasMeanVarDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + bnScale: *const ::core::ffi::c_void, + bnBias: *const ::core::ffi::c_void, + exponentialAverageFactor: f64, + resultRunningMean: *mut ::core::ffi::c_void, + resultRunningVariance: *mut ::core::ffi::c_void, + epsilon: f64, + resultSaveMean: *mut ::core::ffi::c_void, + resultSaveInvVariance: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnBatchNormalizationForwardTrainingEx( + handle: cuda_types::cudnn9::cudnnHandle_t, + mode: cuda_types::cudnn9::cudnnBatchNormMode_t, + bnOps: cuda_types::cudnn9::cudnnBatchNormOps_t, + alpha: *const ::core::ffi::c_void, + beta: *const ::core::ffi::c_void, + xDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + xData: *const ::core::ffi::c_void, + zDesc: 
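+        // Editorial note, illustrative only: the *Ex training entry points
+        // take a caller-provided workspace and reserve space; their sizes are
+        // obtained beforehand from the matching
+        // cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize and
+        // cudnnGetBatchNormalizationTrainingExReserveSpaceSize queries
+        // declared above, and the reserve space must be passed unchanged to
+        // cudnnBatchNormalizationBackwardEx.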
cuda_types::cudnn9::cudnnTensorDescriptor_t, + zData: *const ::core::ffi::c_void, + yDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + yData: *mut ::core::ffi::c_void, + bnScaleBiasMeanVarDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + bnScale: *const ::core::ffi::c_void, + bnBias: *const ::core::ffi::c_void, + exponentialAverageFactor: f64, + resultRunningMean: *mut ::core::ffi::c_void, + resultRunningVariance: *mut ::core::ffi::c_void, + epsilon: f64, + resultSaveMean: *mut ::core::ffi::c_void, + resultSaveInvVariance: *mut ::core::ffi::c_void, + activationDesc: cuda_types::cudnn9::cudnnActivationDescriptor_t, + workspace: *mut ::core::ffi::c_void, + workSpaceSizeInBytes: usize, + reserveSpace: *mut ::core::ffi::c_void, + reserveSpaceSizeInBytes: usize, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnBatchNormalizationBackward( + handle: cuda_types::cudnn9::cudnnHandle_t, + mode: cuda_types::cudnn9::cudnnBatchNormMode_t, + alphaDataDiff: *const ::core::ffi::c_void, + betaDataDiff: *const ::core::ffi::c_void, + alphaParamDiff: *const ::core::ffi::c_void, + betaParamDiff: *const ::core::ffi::c_void, + xDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + x: *const ::core::ffi::c_void, + dyDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + dy: *const ::core::ffi::c_void, + dxDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + dx: *mut ::core::ffi::c_void, + dBnScaleBiasDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + bnScale: *const ::core::ffi::c_void, + dBnScaleResult: *mut ::core::ffi::c_void, + dBnBiasResult: *mut ::core::ffi::c_void, + epsilon: f64, + savedMean: *const ::core::ffi::c_void, + savedInvVariance: *const ::core::ffi::c_void, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnBatchNormalizationBackwardEx( + handle: cuda_types::cudnn9::cudnnHandle_t, + mode: cuda_types::cudnn9::cudnnBatchNormMode_t, + bnOps: cuda_types::cudnn9::cudnnBatchNormOps_t, + alphaDataDiff: *const ::core::ffi::c_void, + betaDataDiff: *const ::core::ffi::c_void, + alphaParamDiff: *const ::core::ffi::c_void, + betaParamDiff: *const ::core::ffi::c_void, + xDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + xData: *const ::core::ffi::c_void, + yDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + yData: *const ::core::ffi::c_void, + dyDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + dyData: *const ::core::ffi::c_void, + dzDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + dzData: *mut ::core::ffi::c_void, + dxDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + dxData: *mut ::core::ffi::c_void, + dBnScaleBiasDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + bnScaleData: *const ::core::ffi::c_void, + bnBiasData: *const ::core::ffi::c_void, + dBnScaleData: *mut ::core::ffi::c_void, + dBnBiasData: *mut ::core::ffi::c_void, + epsilon: f64, + savedMean: *const ::core::ffi::c_void, + savedInvVariance: *const ::core::ffi::c_void, + activationDesc: cuda_types::cudnn9::cudnnActivationDescriptor_t, + workSpace: *mut ::core::ffi::c_void, + workSpaceSizeInBytes: usize, + reserveSpace: *mut ::core::ffi::c_void, + reserveSpaceSizeInBytes: usize, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetNormalizationForwardTrainingWorkspaceSize( + handle: cuda_types::cudnn9::cudnnHandle_t, + mode: cuda_types::cudnn9::cudnnNormMode_t, + normOps: cuda_types::cudnn9::cudnnNormOps_t, + algo: cuda_types::cudnn9::cudnnNormAlgo_t, + xDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + zDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + yDesc: 
cuda_types::cudnn9::cudnnTensorDescriptor_t, + normScaleBiasDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + activationDesc: cuda_types::cudnn9::cudnnActivationDescriptor_t, + normMeanVarDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + sizeInBytes: *mut usize, + groupCnt: ::core::ffi::c_int, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetNormalizationBackwardWorkspaceSize( + handle: cuda_types::cudnn9::cudnnHandle_t, + mode: cuda_types::cudnn9::cudnnNormMode_t, + normOps: cuda_types::cudnn9::cudnnNormOps_t, + algo: cuda_types::cudnn9::cudnnNormAlgo_t, + xDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + yDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + dyDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + dzDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + dxDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + dNormScaleBiasDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + activationDesc: cuda_types::cudnn9::cudnnActivationDescriptor_t, + normMeanVarDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + sizeInBytes: *mut usize, + groupCnt: ::core::ffi::c_int, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetNormalizationTrainingReserveSpaceSize( + handle: cuda_types::cudnn9::cudnnHandle_t, + mode: cuda_types::cudnn9::cudnnNormMode_t, + normOps: cuda_types::cudnn9::cudnnNormOps_t, + algo: cuda_types::cudnn9::cudnnNormAlgo_t, + activationDesc: cuda_types::cudnn9::cudnnActivationDescriptor_t, + xDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + sizeInBytes: *mut usize, + groupCnt: ::core::ffi::c_int, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnNormalizationForwardTraining( + handle: cuda_types::cudnn9::cudnnHandle_t, + mode: cuda_types::cudnn9::cudnnNormMode_t, + normOps: cuda_types::cudnn9::cudnnNormOps_t, + algo: cuda_types::cudnn9::cudnnNormAlgo_t, + alpha: *const ::core::ffi::c_void, + beta: *const ::core::ffi::c_void, + xDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + xData: *const ::core::ffi::c_void, + normScaleBiasDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + normScale: *const ::core::ffi::c_void, + normBias: *const ::core::ffi::c_void, + exponentialAverageFactor: f64, + normMeanVarDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + resultRunningMean: *mut ::core::ffi::c_void, + resultRunningVariance: *mut ::core::ffi::c_void, + epsilon: f64, + resultSaveMean: *mut ::core::ffi::c_void, + resultSaveInvVariance: *mut ::core::ffi::c_void, + activationDesc: cuda_types::cudnn9::cudnnActivationDescriptor_t, + zDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + zData: *const ::core::ffi::c_void, + yDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + yData: *mut ::core::ffi::c_void, + workspace: *mut ::core::ffi::c_void, + workSpaceSizeInBytes: usize, + reserveSpace: *mut ::core::ffi::c_void, + reserveSpaceSizeInBytes: usize, + groupCnt: ::core::ffi::c_int, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnNormalizationBackward( + handle: cuda_types::cudnn9::cudnnHandle_t, + mode: cuda_types::cudnn9::cudnnNormMode_t, + normOps: cuda_types::cudnn9::cudnnNormOps_t, + algo: cuda_types::cudnn9::cudnnNormAlgo_t, + alphaDataDiff: *const ::core::ffi::c_void, + betaDataDiff: *const ::core::ffi::c_void, + alphaParamDiff: *const ::core::ffi::c_void, + betaParamDiff: *const ::core::ffi::c_void, + xDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + xData: *const ::core::ffi::c_void, + yDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + yData: *const ::core::ffi::c_void, + dyDesc: 
cuda_types::cudnn9::cudnnTensorDescriptor_t, + dyData: *const ::core::ffi::c_void, + dzDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + dzData: *mut ::core::ffi::c_void, + dxDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + dxData: *mut ::core::ffi::c_void, + dNormScaleBiasDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + normScaleData: *const ::core::ffi::c_void, + normBiasData: *const ::core::ffi::c_void, + dNormScaleData: *mut ::core::ffi::c_void, + dNormBiasData: *mut ::core::ffi::c_void, + epsilon: f64, + normMeanVarDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + savedMean: *const ::core::ffi::c_void, + savedInvVariance: *const ::core::ffi::c_void, + activationDesc: cuda_types::cudnn9::cudnnActivationDescriptor_t, + workSpace: *mut ::core::ffi::c_void, + workSpaceSizeInBytes: usize, + reserveSpace: *mut ::core::ffi::c_void, + reserveSpaceSizeInBytes: usize, + groupCnt: ::core::ffi::c_int, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnSpatialTfGridGeneratorBackward( + handle: cuda_types::cudnn9::cudnnHandle_t, + stDesc: cuda_types::cudnn9::cudnnSpatialTransformerDescriptor_t, + dgrid: *const ::core::ffi::c_void, + dtheta: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnSpatialTfSamplerBackward( + handle: cuda_types::cudnn9::cudnnHandle_t, + stDesc: cuda_types::cudnn9::cudnnSpatialTransformerDescriptor_t, + alpha: *const ::core::ffi::c_void, + xDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + x: *const ::core::ffi::c_void, + beta: *const ::core::ffi::c_void, + dxDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + dx: *mut ::core::ffi::c_void, + alphaDgrid: *const ::core::ffi::c_void, + dyDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + dy: *const ::core::ffi::c_void, + grid: *const ::core::ffi::c_void, + betaDgrid: *const ::core::ffi::c_void, + dgrid: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnDropoutBackward( + handle: cuda_types::cudnn9::cudnnHandle_t, + dropoutDesc: cuda_types::cudnn9::cudnnDropoutDescriptor_t, + dydesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + dy: *const ::core::ffi::c_void, + dxdesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + dx: *mut ::core::ffi::c_void, + reserveSpace: *mut ::core::ffi::c_void, + reserveSpaceSizeInBytes: usize, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnCreateRNNDescriptor( + rnnDesc: *mut cuda_types::cudnn9::cudnnRNNDescriptor_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnDestroyRNNDescriptor( + rnnDesc: cuda_types::cudnn9::cudnnRNNDescriptor_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnSetRNNDescriptor_v8( + rnnDesc: cuda_types::cudnn9::cudnnRNNDescriptor_t, + algo: cuda_types::cudnn9::cudnnRNNAlgo_t, + cellMode: cuda_types::cudnn9::cudnnRNNMode_t, + biasMode: cuda_types::cudnn9::cudnnRNNBiasMode_t, + dirMode: cuda_types::cudnn9::cudnnDirectionMode_t, + inputMode: cuda_types::cudnn9::cudnnRNNInputMode_t, + dataType: cuda_types::cudnn9::cudnnDataType_t, + mathPrec: cuda_types::cudnn9::cudnnDataType_t, + mathType: cuda_types::cudnn9::cudnnMathType_t, + inputSize: i32, + hiddenSize: i32, + projSize: i32, + numLayers: i32, + dropoutDesc: cuda_types::cudnn9::cudnnDropoutDescriptor_t, + auxFlags: u32, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetRNNDescriptor_v8( + rnnDesc: cuda_types::cudnn9::cudnnRNNDescriptor_t, + algo: *mut cuda_types::cudnn9::cudnnRNNAlgo_t, + cellMode: *mut 
cuda_types::cudnn9::cudnnRNNMode_t, + biasMode: *mut cuda_types::cudnn9::cudnnRNNBiasMode_t, + dirMode: *mut cuda_types::cudnn9::cudnnDirectionMode_t, + inputMode: *mut cuda_types::cudnn9::cudnnRNNInputMode_t, + dataType: *mut cuda_types::cudnn9::cudnnDataType_t, + mathPrec: *mut cuda_types::cudnn9::cudnnDataType_t, + mathType: *mut cuda_types::cudnn9::cudnnMathType_t, + inputSize: *mut i32, + hiddenSize: *mut i32, + projSize: *mut i32, + numLayers: *mut i32, + dropoutDesc: *mut cuda_types::cudnn9::cudnnDropoutDescriptor_t, + auxFlags: *mut u32, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnRNNSetClip_v8( + rnnDesc: cuda_types::cudnn9::cudnnRNNDescriptor_t, + clipMode: cuda_types::cudnn9::cudnnRNNClipMode_t, + clipNanOpt: cuda_types::cudnn9::cudnnNanPropagation_t, + lclip: f64, + rclip: f64, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnRNNSetClip_v9( + rnnDesc: cuda_types::cudnn9::cudnnRNNDescriptor_t, + clipMode: cuda_types::cudnn9::cudnnRNNClipMode_t, + lclip: f64, + rclip: f64, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnRNNGetClip_v8( + rnnDesc: cuda_types::cudnn9::cudnnRNNDescriptor_t, + clipMode: *mut cuda_types::cudnn9::cudnnRNNClipMode_t, + clipNanOpt: *mut cuda_types::cudnn9::cudnnNanPropagation_t, + lclip: *mut f64, + rclip: *mut f64, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnRNNGetClip_v9( + rnnDesc: cuda_types::cudnn9::cudnnRNNDescriptor_t, + clipMode: *mut cuda_types::cudnn9::cudnnRNNClipMode_t, + lclip: *mut f64, + rclip: *mut f64, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnBuildRNNDynamic( + handle: cuda_types::cudnn9::cudnnHandle_t, + rnnDesc: cuda_types::cudnn9::cudnnRNNDescriptor_t, + miniBatch: ::core::ffi::c_int, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetRNNTempSpaceSizes( + handle: cuda_types::cudnn9::cudnnHandle_t, + rnnDesc: cuda_types::cudnn9::cudnnRNNDescriptor_t, + fwdMode: cuda_types::cudnn9::cudnnForwardMode_t, + xDesc: cuda_types::cudnn9::cudnnRNNDataDescriptor_t, + workSpaceSize: *mut usize, + reserveSpaceSize: *mut usize, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetRNNWeightSpaceSize( + handle: cuda_types::cudnn9::cudnnHandle_t, + rnnDesc: cuda_types::cudnn9::cudnnRNNDescriptor_t, + weightSpaceSize: *mut usize, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetRNNWeightParams( + handle: cuda_types::cudnn9::cudnnHandle_t, + rnnDesc: cuda_types::cudnn9::cudnnRNNDescriptor_t, + pseudoLayer: i32, + weightSpaceSize: usize, + weightSpace: *const ::core::ffi::c_void, + linLayerID: i32, + mDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + mAddr: *mut *mut ::core::ffi::c_void, + bDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + bAddr: *mut *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnCreateRNNDataDescriptor( + rnnDataDesc: *mut cuda_types::cudnn9::cudnnRNNDataDescriptor_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnDestroyRNNDataDescriptor( + rnnDataDesc: cuda_types::cudnn9::cudnnRNNDataDescriptor_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnSetRNNDataDescriptor( + rnnDataDesc: cuda_types::cudnn9::cudnnRNNDataDescriptor_t, + dataType: cuda_types::cudnn9::cudnnDataType_t, + layout: cuda_types::cudnn9::cudnnRNNDataLayout_t, + maxSeqLength: ::core::ffi::c_int, + batchSize: ::core::ffi::c_int, + vectorSize: ::core::ffi::c_int, + seqLengthArray: *const ::core::ffi::c_int, + paddingFill: *mut 
::core::ffi::c_void, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetRNNDataDescriptor( + rnnDataDesc: cuda_types::cudnn9::cudnnRNNDataDescriptor_t, + dataType: *mut cuda_types::cudnn9::cudnnDataType_t, + layout: *mut cuda_types::cudnn9::cudnnRNNDataLayout_t, + maxSeqLength: *mut ::core::ffi::c_int, + batchSize: *mut ::core::ffi::c_int, + vectorSize: *mut ::core::ffi::c_int, + arrayLengthRequested: ::core::ffi::c_int, + seqLengthArray: *mut ::core::ffi::c_int, + paddingFill: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnRNNForward( + handle: cuda_types::cudnn9::cudnnHandle_t, + rnnDesc: cuda_types::cudnn9::cudnnRNNDescriptor_t, + fwdMode: cuda_types::cudnn9::cudnnForwardMode_t, + devSeqLengths: *const i32, + xDesc: cuda_types::cudnn9::cudnnRNNDataDescriptor_t, + x: *const ::core::ffi::c_void, + yDesc: cuda_types::cudnn9::cudnnRNNDataDescriptor_t, + y: *mut ::core::ffi::c_void, + hDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + hx: *const ::core::ffi::c_void, + hy: *mut ::core::ffi::c_void, + cDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + cx: *const ::core::ffi::c_void, + cy: *mut ::core::ffi::c_void, + weightSpaceSize: usize, + weightSpace: *const ::core::ffi::c_void, + workSpaceSize: usize, + workSpace: *mut ::core::ffi::c_void, + reserveSpaceSize: usize, + reserveSpace: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnCreateSeqDataDescriptor( + seqDataDesc: *mut cuda_types::cudnn9::cudnnSeqDataDescriptor_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnDestroySeqDataDescriptor( + seqDataDesc: cuda_types::cudnn9::cudnnSeqDataDescriptor_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnSetSeqDataDescriptor( + seqDataDesc: cuda_types::cudnn9::cudnnSeqDataDescriptor_t, + dataType: cuda_types::cudnn9::cudnnDataType_t, + nbDims: ::core::ffi::c_int, + dimA: *const ::core::ffi::c_int, + axes: *const cuda_types::cudnn9::cudnnSeqDataAxis_t, + seqLengthArraySize: usize, + seqLengthArray: *const ::core::ffi::c_int, + paddingFill: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetSeqDataDescriptor( + seqDataDesc: cuda_types::cudnn9::cudnnSeqDataDescriptor_t, + dataType: *mut cuda_types::cudnn9::cudnnDataType_t, + nbDims: *mut ::core::ffi::c_int, + nbDimsRequested: ::core::ffi::c_int, + dimA: *mut ::core::ffi::c_int, + axes: *mut cuda_types::cudnn9::cudnnSeqDataAxis_t, + seqLengthArraySize: *mut usize, + seqLengthSizeRequested: usize, + seqLengthArray: *mut ::core::ffi::c_int, + paddingFill: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnCreateAttnDescriptor( + attnDesc: *mut cuda_types::cudnn9::cudnnAttnDescriptor_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnDestroyAttnDescriptor( + attnDesc: cuda_types::cudnn9::cudnnAttnDescriptor_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnSetAttnDescriptor( + attnDesc: cuda_types::cudnn9::cudnnAttnDescriptor_t, + attnMode: ::core::ffi::c_uint, + nHeads: ::core::ffi::c_int, + smScaler: f64, + dataType: cuda_types::cudnn9::cudnnDataType_t, + computePrec: cuda_types::cudnn9::cudnnDataType_t, + mathType: cuda_types::cudnn9::cudnnMathType_t, + attnDropoutDesc: cuda_types::cudnn9::cudnnDropoutDescriptor_t, + postDropoutDesc: cuda_types::cudnn9::cudnnDropoutDescriptor_t, + qSize: ::core::ffi::c_int, + kSize: ::core::ffi::c_int, + vSize: ::core::ffi::c_int, + qProjSize: 
::core::ffi::c_int, + kProjSize: ::core::ffi::c_int, + vProjSize: ::core::ffi::c_int, + oProjSize: ::core::ffi::c_int, + qoMaxSeqLength: ::core::ffi::c_int, + kvMaxSeqLength: ::core::ffi::c_int, + maxBatchSize: ::core::ffi::c_int, + maxBeamSize: ::core::ffi::c_int, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetAttnDescriptor( + attnDesc: cuda_types::cudnn9::cudnnAttnDescriptor_t, + attnMode: *mut ::core::ffi::c_uint, + nHeads: *mut ::core::ffi::c_int, + smScaler: *mut f64, + dataType: *mut cuda_types::cudnn9::cudnnDataType_t, + computePrec: *mut cuda_types::cudnn9::cudnnDataType_t, + mathType: *mut cuda_types::cudnn9::cudnnMathType_t, + attnDropoutDesc: *mut cuda_types::cudnn9::cudnnDropoutDescriptor_t, + postDropoutDesc: *mut cuda_types::cudnn9::cudnnDropoutDescriptor_t, + qSize: *mut ::core::ffi::c_int, + kSize: *mut ::core::ffi::c_int, + vSize: *mut ::core::ffi::c_int, + qProjSize: *mut ::core::ffi::c_int, + kProjSize: *mut ::core::ffi::c_int, + vProjSize: *mut ::core::ffi::c_int, + oProjSize: *mut ::core::ffi::c_int, + qoMaxSeqLength: *mut ::core::ffi::c_int, + kvMaxSeqLength: *mut ::core::ffi::c_int, + maxBatchSize: *mut ::core::ffi::c_int, + maxBeamSize: *mut ::core::ffi::c_int, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetMultiHeadAttnBuffers( + handle: cuda_types::cudnn9::cudnnHandle_t, + attnDesc: cuda_types::cudnn9::cudnnAttnDescriptor_t, + weightSizeInBytes: *mut usize, + workSpaceSizeInBytes: *mut usize, + reserveSpaceSizeInBytes: *mut usize, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetMultiHeadAttnWeights( + handle: cuda_types::cudnn9::cudnnHandle_t, + attnDesc: cuda_types::cudnn9::cudnnAttnDescriptor_t, + wKind: cuda_types::cudnn9::cudnnMultiHeadAttnWeightKind_t, + weightSizeInBytes: usize, + weights: *const ::core::ffi::c_void, + wDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + wAddr: *mut *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnMultiHeadAttnForward( + handle: cuda_types::cudnn9::cudnnHandle_t, + attnDesc: cuda_types::cudnn9::cudnnAttnDescriptor_t, + currIdx: ::core::ffi::c_int, + loWinIdx: *const ::core::ffi::c_int, + hiWinIdx: *const ::core::ffi::c_int, + devSeqLengthsQO: *const ::core::ffi::c_int, + devSeqLengthsKV: *const ::core::ffi::c_int, + qDesc: cuda_types::cudnn9::cudnnSeqDataDescriptor_t, + queries: *const ::core::ffi::c_void, + residuals: *const ::core::ffi::c_void, + kDesc: cuda_types::cudnn9::cudnnSeqDataDescriptor_t, + keys: *const ::core::ffi::c_void, + vDesc: cuda_types::cudnn9::cudnnSeqDataDescriptor_t, + values: *const ::core::ffi::c_void, + oDesc: cuda_types::cudnn9::cudnnSeqDataDescriptor_t, + out: *mut ::core::ffi::c_void, + weightSizeInBytes: usize, + weights: *const ::core::ffi::c_void, + workSpaceSizeInBytes: usize, + workSpace: *mut ::core::ffi::c_void, + reserveSpaceSizeInBytes: usize, + reserveSpace: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnAdvVersionCheck() -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnRNNBackwardData_v8( + handle: cuda_types::cudnn9::cudnnHandle_t, + rnnDesc: cuda_types::cudnn9::cudnnRNNDescriptor_t, + devSeqLengths: *const i32, + yDesc: cuda_types::cudnn9::cudnnRNNDataDescriptor_t, + y: *const ::core::ffi::c_void, + dy: *const ::core::ffi::c_void, + xDesc: cuda_types::cudnn9::cudnnRNNDataDescriptor_t, + dx: *mut ::core::ffi::c_void, + hDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + hx: *const ::core::ffi::c_void, + dhy: *const 
::core::ffi::c_void, + dhx: *mut ::core::ffi::c_void, + cDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + cx: *const ::core::ffi::c_void, + dcy: *const ::core::ffi::c_void, + dcx: *mut ::core::ffi::c_void, + weightSpaceSize: usize, + weightSpace: *const ::core::ffi::c_void, + workSpaceSize: usize, + workSpace: *mut ::core::ffi::c_void, + reserveSpaceSize: usize, + reserveSpace: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnRNNBackwardWeights_v8( + handle: cuda_types::cudnn9::cudnnHandle_t, + rnnDesc: cuda_types::cudnn9::cudnnRNNDescriptor_t, + addGrad: cuda_types::cudnn9::cudnnWgradMode_t, + devSeqLengths: *const i32, + xDesc: cuda_types::cudnn9::cudnnRNNDataDescriptor_t, + x: *const ::core::ffi::c_void, + hDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + hx: *const ::core::ffi::c_void, + yDesc: cuda_types::cudnn9::cudnnRNNDataDescriptor_t, + y: *const ::core::ffi::c_void, + weightSpaceSize: usize, + dweightSpace: *mut ::core::ffi::c_void, + workSpaceSize: usize, + workSpace: *mut ::core::ffi::c_void, + reserveSpaceSize: usize, + reserveSpace: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnMultiHeadAttnBackwardData( + handle: cuda_types::cudnn9::cudnnHandle_t, + attnDesc: cuda_types::cudnn9::cudnnAttnDescriptor_t, + loWinIdx: *const ::core::ffi::c_int, + hiWinIdx: *const ::core::ffi::c_int, + devSeqLengthsDQDO: *const ::core::ffi::c_int, + devSeqLengthsDKDV: *const ::core::ffi::c_int, + doDesc: cuda_types::cudnn9::cudnnSeqDataDescriptor_t, + dout: *const ::core::ffi::c_void, + dqDesc: cuda_types::cudnn9::cudnnSeqDataDescriptor_t, + dqueries: *mut ::core::ffi::c_void, + queries: *const ::core::ffi::c_void, + dkDesc: cuda_types::cudnn9::cudnnSeqDataDescriptor_t, + dkeys: *mut ::core::ffi::c_void, + keys: *const ::core::ffi::c_void, + dvDesc: cuda_types::cudnn9::cudnnSeqDataDescriptor_t, + dvalues: *mut ::core::ffi::c_void, + values: *const ::core::ffi::c_void, + weightSizeInBytes: usize, + weights: *const ::core::ffi::c_void, + workSpaceSizeInBytes: usize, + workSpace: *mut ::core::ffi::c_void, + reserveSpaceSizeInBytes: usize, + reserveSpace: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnMultiHeadAttnBackwardWeights( + handle: cuda_types::cudnn9::cudnnHandle_t, + attnDesc: cuda_types::cudnn9::cudnnAttnDescriptor_t, + addGrad: cuda_types::cudnn9::cudnnWgradMode_t, + qDesc: cuda_types::cudnn9::cudnnSeqDataDescriptor_t, + queries: *const ::core::ffi::c_void, + kDesc: cuda_types::cudnn9::cudnnSeqDataDescriptor_t, + keys: *const ::core::ffi::c_void, + vDesc: cuda_types::cudnn9::cudnnSeqDataDescriptor_t, + values: *const ::core::ffi::c_void, + doDesc: cuda_types::cudnn9::cudnnSeqDataDescriptor_t, + dout: *const ::core::ffi::c_void, + weightSizeInBytes: usize, + weights: *const ::core::ffi::c_void, + dweights: *mut ::core::ffi::c_void, + workSpaceSizeInBytes: usize, + workSpace: *mut ::core::ffi::c_void, + reserveSpaceSizeInBytes: usize, + reserveSpace: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnCreateCTCLossDescriptor( + ctcLossDesc: *mut cuda_types::cudnn9::cudnnCTCLossDescriptor_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnSetCTCLossDescriptor( + ctcLossDesc: cuda_types::cudnn9::cudnnCTCLossDescriptor_t, + compType: cuda_types::cudnn9::cudnnDataType_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnSetCTCLossDescriptorEx( + ctcLossDesc: 
cuda_types::cudnn9::cudnnCTCLossDescriptor_t, + compType: cuda_types::cudnn9::cudnnDataType_t, + normMode: cuda_types::cudnn9::cudnnLossNormalizationMode_t, + gradMode: cuda_types::cudnn9::cudnnNanPropagation_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnSetCTCLossDescriptor_v8( + ctcLossDesc: cuda_types::cudnn9::cudnnCTCLossDescriptor_t, + compType: cuda_types::cudnn9::cudnnDataType_t, + normMode: cuda_types::cudnn9::cudnnLossNormalizationMode_t, + gradMode: cuda_types::cudnn9::cudnnNanPropagation_t, + maxLabelLength: ::core::ffi::c_int, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnSetCTCLossDescriptor_v9( + ctcLossDesc: cuda_types::cudnn9::cudnnCTCLossDescriptor_t, + compType: cuda_types::cudnn9::cudnnDataType_t, + normMode: cuda_types::cudnn9::cudnnLossNormalizationMode_t, + ctcGradMode: cuda_types::cudnn9::cudnnCTCGradMode_t, + maxLabelLength: ::core::ffi::c_int, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetCTCLossDescriptor( + ctcLossDesc: cuda_types::cudnn9::cudnnCTCLossDescriptor_t, + compType: *mut cuda_types::cudnn9::cudnnDataType_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetCTCLossDescriptorEx( + ctcLossDesc: cuda_types::cudnn9::cudnnCTCLossDescriptor_t, + compType: *mut cuda_types::cudnn9::cudnnDataType_t, + normMode: *mut cuda_types::cudnn9::cudnnLossNormalizationMode_t, + gradMode: *mut cuda_types::cudnn9::cudnnNanPropagation_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetCTCLossDescriptor_v8( + ctcLossDesc: cuda_types::cudnn9::cudnnCTCLossDescriptor_t, + compType: *mut cuda_types::cudnn9::cudnnDataType_t, + normMode: *mut cuda_types::cudnn9::cudnnLossNormalizationMode_t, + gradMode: *mut cuda_types::cudnn9::cudnnNanPropagation_t, + maxLabelLength: *mut ::core::ffi::c_int, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetCTCLossDescriptor_v9( + ctcLossDesc: cuda_types::cudnn9::cudnnCTCLossDescriptor_t, + compType: *mut cuda_types::cudnn9::cudnnDataType_t, + normMode: *mut cuda_types::cudnn9::cudnnLossNormalizationMode_t, + ctcGradMode: *mut cuda_types::cudnn9::cudnnCTCGradMode_t, + maxLabelLength: *mut ::core::ffi::c_int, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnDestroyCTCLossDescriptor( + ctcLossDesc: cuda_types::cudnn9::cudnnCTCLossDescriptor_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnCTCLoss( + handle: cuda_types::cudnn9::cudnnHandle_t, + probsDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + probs: *const ::core::ffi::c_void, + hostLabels: *const ::core::ffi::c_int, + hostLabelLengths: *const ::core::ffi::c_int, + hostInputLengths: *const ::core::ffi::c_int, + costs: *mut ::core::ffi::c_void, + gradientsDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + gradients: *mut ::core::ffi::c_void, + algo: cuda_types::cudnn9::cudnnCTCLossAlgo_t, + ctcLossDesc: cuda_types::cudnn9::cudnnCTCLossDescriptor_t, + workspace: *mut ::core::ffi::c_void, + workSpaceSizeInBytes: usize, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnCTCLoss_v8( + handle: cuda_types::cudnn9::cudnnHandle_t, + algo: cuda_types::cudnn9::cudnnCTCLossAlgo_t, + ctcLossDesc: cuda_types::cudnn9::cudnnCTCLossDescriptor_t, + probsDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + probs: *const ::core::ffi::c_void, + labels: *const ::core::ffi::c_int, + labelLengths: *const ::core::ffi::c_int, + inputLengths: *const ::core::ffi::c_int, + costs: *mut ::core::ffi::c_void, + gradientsDesc: 
cuda_types::cudnn9::cudnnTensorDescriptor_t, + gradients: *mut ::core::ffi::c_void, + workSpaceSizeInBytes: usize, + workspace: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetCTCLossWorkspaceSize( + handle: cuda_types::cudnn9::cudnnHandle_t, + probsDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + gradientsDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + labels: *const ::core::ffi::c_int, + labelLengths: *const ::core::ffi::c_int, + inputLengths: *const ::core::ffi::c_int, + algo: cuda_types::cudnn9::cudnnCTCLossAlgo_t, + ctcLossDesc: cuda_types::cudnn9::cudnnCTCLossDescriptor_t, + sizeInBytes: *mut usize, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetCTCLossWorkspaceSize_v8( + handle: cuda_types::cudnn9::cudnnHandle_t, + algo: cuda_types::cudnn9::cudnnCTCLossAlgo_t, + ctcLossDesc: cuda_types::cudnn9::cudnnCTCLossDescriptor_t, + probsDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + gradientsDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + sizeInBytes: *mut usize, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnCreateConvolutionDescriptor( + convDesc: *mut cuda_types::cudnn9::cudnnConvolutionDescriptor_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnDestroyConvolutionDescriptor( + convDesc: cuda_types::cudnn9::cudnnConvolutionDescriptor_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnSetConvolutionMathType( + convDesc: cuda_types::cudnn9::cudnnConvolutionDescriptor_t, + mathType: cuda_types::cudnn9::cudnnMathType_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetConvolutionMathType( + convDesc: cuda_types::cudnn9::cudnnConvolutionDescriptor_t, + mathType: *mut cuda_types::cudnn9::cudnnMathType_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnSetConvolutionGroupCount( + convDesc: cuda_types::cudnn9::cudnnConvolutionDescriptor_t, + groupCount: ::core::ffi::c_int, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetConvolutionGroupCount( + convDesc: cuda_types::cudnn9::cudnnConvolutionDescriptor_t, + groupCount: *mut ::core::ffi::c_int, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnSetConvolutionReorderType( + convDesc: cuda_types::cudnn9::cudnnConvolutionDescriptor_t, + reorderType: cuda_types::cudnn9::cudnnReorderType_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetConvolutionReorderType( + convDesc: cuda_types::cudnn9::cudnnConvolutionDescriptor_t, + reorderType: *mut cuda_types::cudnn9::cudnnReorderType_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnSetConvolution2dDescriptor( + convDesc: cuda_types::cudnn9::cudnnConvolutionDescriptor_t, + pad_h: ::core::ffi::c_int, + pad_w: ::core::ffi::c_int, + u: ::core::ffi::c_int, + v: ::core::ffi::c_int, + dilation_h: ::core::ffi::c_int, + dilation_w: ::core::ffi::c_int, + mode: cuda_types::cudnn9::cudnnConvolutionMode_t, + computeType: cuda_types::cudnn9::cudnnDataType_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetConvolution2dDescriptor( + convDesc: cuda_types::cudnn9::cudnnConvolutionDescriptor_t, + pad_h: *mut ::core::ffi::c_int, + pad_w: *mut ::core::ffi::c_int, + u: *mut ::core::ffi::c_int, + v: *mut ::core::ffi::c_int, + dilation_h: *mut ::core::ffi::c_int, + dilation_w: *mut ::core::ffi::c_int, + mode: *mut cuda_types::cudnn9::cudnnConvolutionMode_t, + computeType: *mut cuda_types::cudnn9::cudnnDataType_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + 
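A caller-side sketch of the descriptor life cycle shared by the convolution entry points above (create, set, query, destroy). This is illustrative only and not part of the generated bindings: `check`, which converts a cudnnStatus_t into a Rust Result, is a hypothetical helper, and the enum variants are assumed to be exposed under their C names:

    unsafe {
        let mut conv_desc = std::ptr::null_mut();
        check(cudnnCreateConvolutionDescriptor(&mut conv_desc))?;
        check(cudnnSetConvolution2dDescriptor(
            conv_desc,
            0, 0, // pad_h, pad_w
            1, 1, // u, v: vertical/horizontal filter stride
            1, 1, // dilation_h, dilation_w
            cudnnConvolutionMode_t::CUDNN_CROSS_CORRELATION,
            cudnnDataType_t::CUDNN_DATA_FLOAT,
        ))?;
        // ... run convolutions that use conv_desc ...
        check(cudnnDestroyConvolutionDescriptor(conv_desc))?;
    }
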
#[must_use] + fn cudnnSetConvolutionNdDescriptor( + convDesc: cuda_types::cudnn9::cudnnConvolutionDescriptor_t, + arrayLength: ::core::ffi::c_int, + padA: *const ::core::ffi::c_int, + filterStrideA: *const ::core::ffi::c_int, + dilationA: *const ::core::ffi::c_int, + mode: cuda_types::cudnn9::cudnnConvolutionMode_t, + computeType: cuda_types::cudnn9::cudnnDataType_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetConvolutionNdDescriptor( + convDesc: cuda_types::cudnn9::cudnnConvolutionDescriptor_t, + arrayLengthRequested: ::core::ffi::c_int, + arrayLength: *mut ::core::ffi::c_int, + padA: *mut ::core::ffi::c_int, + strideA: *mut ::core::ffi::c_int, + dilationA: *mut ::core::ffi::c_int, + mode: *mut cuda_types::cudnn9::cudnnConvolutionMode_t, + computeType: *mut cuda_types::cudnn9::cudnnDataType_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetConvolution2dForwardOutputDim( + convDesc: cuda_types::cudnn9::cudnnConvolutionDescriptor_t, + inputTensorDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + filterDesc: cuda_types::cudnn9::cudnnFilterDescriptor_t, + n: *mut ::core::ffi::c_int, + c: *mut ::core::ffi::c_int, + h: *mut ::core::ffi::c_int, + w: *mut ::core::ffi::c_int, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetConvolutionNdForwardOutputDim( + convDesc: cuda_types::cudnn9::cudnnConvolutionDescriptor_t, + inputTensorDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + filterDesc: cuda_types::cudnn9::cudnnFilterDescriptor_t, + nbDims: ::core::ffi::c_int, + tensorOuputDimA: *mut ::core::ffi::c_int, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetConvolutionForwardAlgorithmMaxCount( + handle: cuda_types::cudnn9::cudnnHandle_t, + count: *mut ::core::ffi::c_int, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetConvolutionForwardAlgorithm_v7( + handle: cuda_types::cudnn9::cudnnHandle_t, + srcDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + filterDesc: cuda_types::cudnn9::cudnnFilterDescriptor_t, + convDesc: cuda_types::cudnn9::cudnnConvolutionDescriptor_t, + destDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + requestedAlgoCount: ::core::ffi::c_int, + returnedAlgoCount: *mut ::core::ffi::c_int, + perfResults: *mut cuda_types::cudnn9::cudnnConvolutionFwdAlgoPerf_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnFindConvolutionForwardAlgorithm( + handle: cuda_types::cudnn9::cudnnHandle_t, + xDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + wDesc: cuda_types::cudnn9::cudnnFilterDescriptor_t, + convDesc: cuda_types::cudnn9::cudnnConvolutionDescriptor_t, + yDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + requestedAlgoCount: ::core::ffi::c_int, + returnedAlgoCount: *mut ::core::ffi::c_int, + perfResults: *mut cuda_types::cudnn9::cudnnConvolutionFwdAlgoPerf_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnFindConvolutionForwardAlgorithmEx( + handle: cuda_types::cudnn9::cudnnHandle_t, + xDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + x: *const ::core::ffi::c_void, + wDesc: cuda_types::cudnn9::cudnnFilterDescriptor_t, + w: *const ::core::ffi::c_void, + convDesc: cuda_types::cudnn9::cudnnConvolutionDescriptor_t, + yDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + y: *mut ::core::ffi::c_void, + requestedAlgoCount: ::core::ffi::c_int, + returnedAlgoCount: *mut ::core::ffi::c_int, + perfResults: *mut cuda_types::cudnn9::cudnnConvolutionFwdAlgoPerf_t, + workSpace: *mut ::core::ffi::c_void, + workSpaceSizeInBytes: usize, + ) -> 
cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnIm2Col( + handle: cuda_types::cudnn9::cudnnHandle_t, + xDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + x: *const ::core::ffi::c_void, + wDesc: cuda_types::cudnn9::cudnnFilterDescriptor_t, + convDesc: cuda_types::cudnn9::cudnnConvolutionDescriptor_t, + colBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnReorderFilterAndBias( + handle: cuda_types::cudnn9::cudnnHandle_t, + filterDesc: cuda_types::cudnn9::cudnnFilterDescriptor_t, + reorderType: cuda_types::cudnn9::cudnnReorderType_t, + filterData: *const ::core::ffi::c_void, + reorderedFilterData: *mut ::core::ffi::c_void, + reorderBias: ::core::ffi::c_int, + biasData: *const ::core::ffi::c_void, + reorderedBiasData: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetConvolutionForwardWorkspaceSize( + handle: cuda_types::cudnn9::cudnnHandle_t, + xDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + wDesc: cuda_types::cudnn9::cudnnFilterDescriptor_t, + convDesc: cuda_types::cudnn9::cudnnConvolutionDescriptor_t, + yDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + algo: cuda_types::cudnn9::cudnnConvolutionFwdAlgo_t, + sizeInBytes: *mut usize, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnConvolutionForward( + handle: cuda_types::cudnn9::cudnnHandle_t, + alpha: *const ::core::ffi::c_void, + xDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + x: *const ::core::ffi::c_void, + wDesc: cuda_types::cudnn9::cudnnFilterDescriptor_t, + w: *const ::core::ffi::c_void, + convDesc: cuda_types::cudnn9::cudnnConvolutionDescriptor_t, + algo: cuda_types::cudnn9::cudnnConvolutionFwdAlgo_t, + workSpace: *mut ::core::ffi::c_void, + workSpaceSizeInBytes: usize, + beta: *const ::core::ffi::c_void, + yDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + y: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnConvolutionBiasActivationForward( + handle: cuda_types::cudnn9::cudnnHandle_t, + alpha1: *const ::core::ffi::c_void, + xDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + x: *const ::core::ffi::c_void, + wDesc: cuda_types::cudnn9::cudnnFilterDescriptor_t, + w: *const ::core::ffi::c_void, + convDesc: cuda_types::cudnn9::cudnnConvolutionDescriptor_t, + algo: cuda_types::cudnn9::cudnnConvolutionFwdAlgo_t, + workSpace: *mut ::core::ffi::c_void, + workSpaceSizeInBytes: usize, + alpha2: *const ::core::ffi::c_void, + zDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + z: *const ::core::ffi::c_void, + biasDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + bias: *const ::core::ffi::c_void, + activationDesc: cuda_types::cudnn9::cudnnActivationDescriptor_t, + yDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + y: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetConvolutionBackwardDataAlgorithmMaxCount( + handle: cuda_types::cudnn9::cudnnHandle_t, + count: *mut ::core::ffi::c_int, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnFindConvolutionBackwardDataAlgorithm( + handle: cuda_types::cudnn9::cudnnHandle_t, + wDesc: cuda_types::cudnn9::cudnnFilterDescriptor_t, + dyDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + convDesc: cuda_types::cudnn9::cudnnConvolutionDescriptor_t, + dxDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + requestedAlgoCount: ::core::ffi::c_int, + returnedAlgoCount: *mut ::core::ffi::c_int, + perfResults: *mut 
cuda_types::cudnn9::cudnnConvolutionBwdDataAlgoPerf_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnFindConvolutionBackwardDataAlgorithmEx( + handle: cuda_types::cudnn9::cudnnHandle_t, + wDesc: cuda_types::cudnn9::cudnnFilterDescriptor_t, + w: *const ::core::ffi::c_void, + dyDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + dy: *const ::core::ffi::c_void, + convDesc: cuda_types::cudnn9::cudnnConvolutionDescriptor_t, + dxDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + dx: *mut ::core::ffi::c_void, + requestedAlgoCount: ::core::ffi::c_int, + returnedAlgoCount: *mut ::core::ffi::c_int, + perfResults: *mut cuda_types::cudnn9::cudnnConvolutionBwdDataAlgoPerf_t, + workSpace: *mut ::core::ffi::c_void, + workSpaceSizeInBytes: usize, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetConvolutionBackwardDataAlgorithm_v7( + handle: cuda_types::cudnn9::cudnnHandle_t, + filterDesc: cuda_types::cudnn9::cudnnFilterDescriptor_t, + diffDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + convDesc: cuda_types::cudnn9::cudnnConvolutionDescriptor_t, + gradDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + requestedAlgoCount: ::core::ffi::c_int, + returnedAlgoCount: *mut ::core::ffi::c_int, + perfResults: *mut cuda_types::cudnn9::cudnnConvolutionBwdDataAlgoPerf_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetConvolutionBackwardDataWorkspaceSize( + handle: cuda_types::cudnn9::cudnnHandle_t, + wDesc: cuda_types::cudnn9::cudnnFilterDescriptor_t, + dyDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + convDesc: cuda_types::cudnn9::cudnnConvolutionDescriptor_t, + dxDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + algo: cuda_types::cudnn9::cudnnConvolutionBwdDataAlgo_t, + sizeInBytes: *mut usize, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnConvolutionBackwardData( + handle: cuda_types::cudnn9::cudnnHandle_t, + alpha: *const ::core::ffi::c_void, + wDesc: cuda_types::cudnn9::cudnnFilterDescriptor_t, + w: *const ::core::ffi::c_void, + dyDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + dy: *const ::core::ffi::c_void, + convDesc: cuda_types::cudnn9::cudnnConvolutionDescriptor_t, + algo: cuda_types::cudnn9::cudnnConvolutionBwdDataAlgo_t, + workSpace: *mut ::core::ffi::c_void, + workSpaceSizeInBytes: usize, + beta: *const ::core::ffi::c_void, + dxDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + dx: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetFoldedConvBackwardDataDescriptors( + handle: cuda_types::cudnn9::cudnnHandle_t, + filterDesc: cuda_types::cudnn9::cudnnFilterDescriptor_t, + diffDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + convDesc: cuda_types::cudnn9::cudnnConvolutionDescriptor_t, + gradDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + transformFormat: cuda_types::cudnn9::cudnnTensorFormat_t, + foldedFilterDesc: cuda_types::cudnn9::cudnnFilterDescriptor_t, + paddedDiffDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + foldedConvDesc: cuda_types::cudnn9::cudnnConvolutionDescriptor_t, + foldedGradDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + filterFoldTransDesc: cuda_types::cudnn9::cudnnTensorTransformDescriptor_t, + diffPadTransDesc: cuda_types::cudnn9::cudnnTensorTransformDescriptor_t, + gradFoldTransDesc: cuda_types::cudnn9::cudnnTensorTransformDescriptor_t, + gradUnfoldTransDesc: cuda_types::cudnn9::cudnnTensorTransformDescriptor_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnCnnVersionCheck() -> 
cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetConvolutionBackwardFilterAlgorithmMaxCount( + handle: cuda_types::cudnn9::cudnnHandle_t, + count: *mut ::core::ffi::c_int, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnFindConvolutionBackwardFilterAlgorithm( + handle: cuda_types::cudnn9::cudnnHandle_t, + xDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + dyDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + convDesc: cuda_types::cudnn9::cudnnConvolutionDescriptor_t, + dwDesc: cuda_types::cudnn9::cudnnFilterDescriptor_t, + requestedAlgoCount: ::core::ffi::c_int, + returnedAlgoCount: *mut ::core::ffi::c_int, + perfResults: *mut cuda_types::cudnn9::cudnnConvolutionBwdFilterAlgoPerf_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnFindConvolutionBackwardFilterAlgorithmEx( + handle: cuda_types::cudnn9::cudnnHandle_t, + xDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + x: *const ::core::ffi::c_void, + dyDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + y: *const ::core::ffi::c_void, + convDesc: cuda_types::cudnn9::cudnnConvolutionDescriptor_t, + dwDesc: cuda_types::cudnn9::cudnnFilterDescriptor_t, + dw: *mut ::core::ffi::c_void, + requestedAlgoCount: ::core::ffi::c_int, + returnedAlgoCount: *mut ::core::ffi::c_int, + perfResults: *mut cuda_types::cudnn9::cudnnConvolutionBwdFilterAlgoPerf_t, + workSpace: *mut ::core::ffi::c_void, + workSpaceSizeInBytes: usize, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetConvolutionBackwardFilterAlgorithm_v7( + handle: cuda_types::cudnn9::cudnnHandle_t, + srcDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + diffDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + convDesc: cuda_types::cudnn9::cudnnConvolutionDescriptor_t, + gradDesc: cuda_types::cudnn9::cudnnFilterDescriptor_t, + requestedAlgoCount: ::core::ffi::c_int, + returnedAlgoCount: *mut ::core::ffi::c_int, + perfResults: *mut cuda_types::cudnn9::cudnnConvolutionBwdFilterAlgoPerf_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetConvolutionBackwardFilterWorkspaceSize( + handle: cuda_types::cudnn9::cudnnHandle_t, + xDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + dyDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + convDesc: cuda_types::cudnn9::cudnnConvolutionDescriptor_t, + gradDesc: cuda_types::cudnn9::cudnnFilterDescriptor_t, + algo: cuda_types::cudnn9::cudnnConvolutionBwdFilterAlgo_t, + sizeInBytes: *mut usize, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnConvolutionBackwardFilter( + handle: cuda_types::cudnn9::cudnnHandle_t, + alpha: *const ::core::ffi::c_void, + xDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + x: *const ::core::ffi::c_void, + dyDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + dy: *const ::core::ffi::c_void, + convDesc: cuda_types::cudnn9::cudnnConvolutionDescriptor_t, + algo: cuda_types::cudnn9::cudnnConvolutionBwdFilterAlgo_t, + workSpace: *mut ::core::ffi::c_void, + workSpaceSizeInBytes: usize, + beta: *const ::core::ffi::c_void, + dwDesc: cuda_types::cudnn9::cudnnFilterDescriptor_t, + dw: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnConvolutionBackwardBias( + handle: cuda_types::cudnn9::cudnnHandle_t, + alpha: *const ::core::ffi::c_void, + dyDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + dy: *const ::core::ffi::c_void, + beta: *const ::core::ffi::c_void, + dbDesc: cuda_types::cudnn9::cudnnTensorDescriptor_t, + db: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn9::cudnnStatus_t; + 
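A caller-side sketch of how the convolution entry points above are meant to be driven: each execute call is paired with a workspace-size query, and the pattern is query, allocate, execute. Illustrative only, not part of the generated bindings; `check` (a cudnnStatus_t-to-Result helper) and `device_alloc` (e.g. a cuMemAlloc wrapper returning *mut c_void) are hypothetical, and all descriptors and device pointers are assumed to be already initialized:

    unsafe {
        let mut workspace_size = 0usize;
        check(cudnnGetConvolutionBackwardFilterWorkspaceSize(
            handle, x_desc, dy_desc, conv_desc, dw_desc, algo, &mut workspace_size,
        ))?;
        let workspace = device_alloc(workspace_size)?; // hypothetical allocation helper
        check(cudnnConvolutionBackwardFilter(
            handle,
            &alpha as *const f32 as *const _, x_desc, x, dy_desc, dy,
            conv_desc, algo, workspace, workspace_size,
            &beta as *const f32 as *const _, dw_desc, dw,
        ))?;
    }
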
#[must_use] + fn cudnnCreateFusedOpsConstParamPack( + constPack: *mut cuda_types::cudnn9::cudnnFusedOpsConstParamPack_t, + ops: cuda_types::cudnn9::cudnnFusedOps_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnDestroyFusedOpsConstParamPack( + constPack: cuda_types::cudnn9::cudnnFusedOpsConstParamPack_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnSetFusedOpsConstParamPackAttribute( + constPack: cuda_types::cudnn9::cudnnFusedOpsConstParamPack_t, + paramLabel: cuda_types::cudnn9::cudnnFusedOpsConstParamLabel_t, + param: *const ::core::ffi::c_void, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetFusedOpsConstParamPackAttribute( + constPack: cuda_types::cudnn9::cudnnFusedOpsConstParamPack_t, + paramLabel: cuda_types::cudnn9::cudnnFusedOpsConstParamLabel_t, + param: *mut ::core::ffi::c_void, + isNULL: *mut ::core::ffi::c_int, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnCreateFusedOpsVariantParamPack( + varPack: *mut cuda_types::cudnn9::cudnnFusedOpsVariantParamPack_t, + ops: cuda_types::cudnn9::cudnnFusedOps_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnDestroyFusedOpsVariantParamPack( + varPack: cuda_types::cudnn9::cudnnFusedOpsVariantParamPack_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnSetFusedOpsVariantParamPackAttribute( + varPack: cuda_types::cudnn9::cudnnFusedOpsVariantParamPack_t, + paramLabel: cuda_types::cudnn9::cudnnFusedOpsVariantParamLabel_t, + ptr: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnGetFusedOpsVariantParamPackAttribute( + varPack: cuda_types::cudnn9::cudnnFusedOpsVariantParamPack_t, + paramLabel: cuda_types::cudnn9::cudnnFusedOpsVariantParamLabel_t, + ptr: *mut ::core::ffi::c_void, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnCreateFusedOpsPlan( + plan: *mut cuda_types::cudnn9::cudnnFusedOpsPlan_t, + ops: cuda_types::cudnn9::cudnnFusedOps_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnDestroyFusedOpsPlan( + plan: cuda_types::cudnn9::cudnnFusedOpsPlan_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnMakeFusedOpsPlan( + handle: cuda_types::cudnn9::cudnnHandle_t, + plan: cuda_types::cudnn9::cudnnFusedOpsPlan_t, + constPack: cuda_types::cudnn9::cudnnFusedOpsConstParamPack_t, + workspaceSizeInBytes: *mut usize, + ) -> cuda_types::cudnn9::cudnnStatus_t; + #[must_use] + fn cudnnFusedOpsExecute( + handle: cuda_types::cudnn9::cudnnHandle_t, + plan: cuda_types::cudnn9::cudnnFusedOpsPlan_t, + varPack: cuda_types::cudnn9::cudnnFusedOpsVariantParamPack_t, + ) -> cuda_types::cudnn9::cudnnStatus_t; +}
diff --git a/cuda_base/src/cufft.rs b/cuda_base/src/cufft.rs
new file mode 100644
index 0000000..894b2ea
--- /dev/null
+++ b/cuda_base/src/cufft.rs
@@ -0,0 +1,368 @@
+// Generated automatically by zluda_bindgen +// DO NOT EDIT MANUALLY +#![allow(warnings)] +extern "system" { + fn cufftPlan1d( + plan: *mut cuda_types::cufft::cufftHandle, + nx: ::core::ffi::c_int, + type_: cuda_types::cufft::cufftType, + batch: ::core::ffi::c_int, + ) -> cuda_types::cufft::cufftResult; + fn cufftPlan2d( + plan: *mut cuda_types::cufft::cufftHandle, + nx: ::core::ffi::c_int, + ny: ::core::ffi::c_int, + type_: cuda_types::cufft::cufftType, + ) -> cuda_types::cufft::cufftResult; + fn cufftPlan3d( + plan: *mut cuda_types::cufft::cufftHandle, + nx: ::core::ffi::c_int, + ny: ::core::ffi::c_int, + nz: ::core::ffi::c_int, + type_: cuda_types::cufft::cufftType, + ) -> 
cuda_types::cufft::cufftResult; + fn cufftPlanMany( + plan: *mut cuda_types::cufft::cufftHandle, + rank: ::core::ffi::c_int, + n: *mut ::core::ffi::c_int, + inembed: *mut ::core::ffi::c_int, + istride: ::core::ffi::c_int, + idist: ::core::ffi::c_int, + onembed: *mut ::core::ffi::c_int, + ostride: ::core::ffi::c_int, + odist: ::core::ffi::c_int, + type_: cuda_types::cufft::cufftType, + batch: ::core::ffi::c_int, + ) -> cuda_types::cufft::cufftResult; + fn cufftMakePlan1d( + plan: cuda_types::cufft::cufftHandle, + nx: ::core::ffi::c_int, + type_: cuda_types::cufft::cufftType, + batch: ::core::ffi::c_int, + workSize: *mut usize, + ) -> cuda_types::cufft::cufftResult; + fn cufftMakePlan2d( + plan: cuda_types::cufft::cufftHandle, + nx: ::core::ffi::c_int, + ny: ::core::ffi::c_int, + type_: cuda_types::cufft::cufftType, + workSize: *mut usize, + ) -> cuda_types::cufft::cufftResult; + fn cufftMakePlan3d( + plan: cuda_types::cufft::cufftHandle, + nx: ::core::ffi::c_int, + ny: ::core::ffi::c_int, + nz: ::core::ffi::c_int, + type_: cuda_types::cufft::cufftType, + workSize: *mut usize, + ) -> cuda_types::cufft::cufftResult; + fn cufftMakePlanMany( + plan: cuda_types::cufft::cufftHandle, + rank: ::core::ffi::c_int, + n: *mut ::core::ffi::c_int, + inembed: *mut ::core::ffi::c_int, + istride: ::core::ffi::c_int, + idist: ::core::ffi::c_int, + onembed: *mut ::core::ffi::c_int, + ostride: ::core::ffi::c_int, + odist: ::core::ffi::c_int, + type_: cuda_types::cufft::cufftType, + batch: ::core::ffi::c_int, + workSize: *mut usize, + ) -> cuda_types::cufft::cufftResult; + fn cufftMakePlanMany64( + plan: cuda_types::cufft::cufftHandle, + rank: ::core::ffi::c_int, + n: *mut ::core::ffi::c_longlong, + inembed: *mut ::core::ffi::c_longlong, + istride: ::core::ffi::c_longlong, + idist: ::core::ffi::c_longlong, + onembed: *mut ::core::ffi::c_longlong, + ostride: ::core::ffi::c_longlong, + odist: ::core::ffi::c_longlong, + type_: cuda_types::cufft::cufftType, + batch: ::core::ffi::c_longlong, + workSize: *mut usize, + ) -> cuda_types::cufft::cufftResult; + fn cufftGetSizeMany64( + plan: cuda_types::cufft::cufftHandle, + rank: ::core::ffi::c_int, + n: *mut ::core::ffi::c_longlong, + inembed: *mut ::core::ffi::c_longlong, + istride: ::core::ffi::c_longlong, + idist: ::core::ffi::c_longlong, + onembed: *mut ::core::ffi::c_longlong, + ostride: ::core::ffi::c_longlong, + odist: ::core::ffi::c_longlong, + type_: cuda_types::cufft::cufftType, + batch: ::core::ffi::c_longlong, + workSize: *mut usize, + ) -> cuda_types::cufft::cufftResult; + fn cufftEstimate1d( + nx: ::core::ffi::c_int, + type_: cuda_types::cufft::cufftType, + batch: ::core::ffi::c_int, + workSize: *mut usize, + ) -> cuda_types::cufft::cufftResult; + fn cufftEstimate2d( + nx: ::core::ffi::c_int, + ny: ::core::ffi::c_int, + type_: cuda_types::cufft::cufftType, + workSize: *mut usize, + ) -> cuda_types::cufft::cufftResult; + fn cufftEstimate3d( + nx: ::core::ffi::c_int, + ny: ::core::ffi::c_int, + nz: ::core::ffi::c_int, + type_: cuda_types::cufft::cufftType, + workSize: *mut usize, + ) -> cuda_types::cufft::cufftResult; + fn cufftEstimateMany( + rank: ::core::ffi::c_int, + n: *mut ::core::ffi::c_int, + inembed: *mut ::core::ffi::c_int, + istride: ::core::ffi::c_int, + idist: ::core::ffi::c_int, + onembed: *mut ::core::ffi::c_int, + ostride: ::core::ffi::c_int, + odist: ::core::ffi::c_int, + type_: cuda_types::cufft::cufftType, + batch: ::core::ffi::c_int, + workSize: *mut usize, + ) -> cuda_types::cufft::cufftResult; + fn cufftCreate( + handle: *mut 
cuda_types::cufft::cufftHandle, + ) -> cuda_types::cufft::cufftResult; + fn cufftGetSize1d( + handle: cuda_types::cufft::cufftHandle, + nx: ::core::ffi::c_int, + type_: cuda_types::cufft::cufftType, + batch: ::core::ffi::c_int, + workSize: *mut usize, + ) -> cuda_types::cufft::cufftResult; + fn cufftGetSize2d( + handle: cuda_types::cufft::cufftHandle, + nx: ::core::ffi::c_int, + ny: ::core::ffi::c_int, + type_: cuda_types::cufft::cufftType, + workSize: *mut usize, + ) -> cuda_types::cufft::cufftResult; + fn cufftGetSize3d( + handle: cuda_types::cufft::cufftHandle, + nx: ::core::ffi::c_int, + ny: ::core::ffi::c_int, + nz: ::core::ffi::c_int, + type_: cuda_types::cufft::cufftType, + workSize: *mut usize, + ) -> cuda_types::cufft::cufftResult; + fn cufftGetSizeMany( + handle: cuda_types::cufft::cufftHandle, + rank: ::core::ffi::c_int, + n: *mut ::core::ffi::c_int, + inembed: *mut ::core::ffi::c_int, + istride: ::core::ffi::c_int, + idist: ::core::ffi::c_int, + onembed: *mut ::core::ffi::c_int, + ostride: ::core::ffi::c_int, + odist: ::core::ffi::c_int, + type_: cuda_types::cufft::cufftType, + batch: ::core::ffi::c_int, + workArea: *mut usize, + ) -> cuda_types::cufft::cufftResult; + fn cufftGetSize( + handle: cuda_types::cufft::cufftHandle, + workSize: *mut usize, + ) -> cuda_types::cufft::cufftResult; + fn cufftSetWorkArea( + plan: cuda_types::cufft::cufftHandle, + workArea: *mut ::core::ffi::c_void, + ) -> cuda_types::cufft::cufftResult; + fn cufftSetAutoAllocation( + plan: cuda_types::cufft::cufftHandle, + autoAllocate: ::core::ffi::c_int, + ) -> cuda_types::cufft::cufftResult; + fn cufftExecC2C( + plan: cuda_types::cufft::cufftHandle, + idata: *mut cuda_types::cufft::cufftComplex, + odata: *mut cuda_types::cufft::cufftComplex, + direction: ::core::ffi::c_int, + ) -> cuda_types::cufft::cufftResult; + fn cufftExecR2C( + plan: cuda_types::cufft::cufftHandle, + idata: *mut cuda_types::cufft::cufftReal, + odata: *mut cuda_types::cufft::cufftComplex, + ) -> cuda_types::cufft::cufftResult; + fn cufftExecC2R( + plan: cuda_types::cufft::cufftHandle, + idata: *mut cuda_types::cufft::cufftComplex, + odata: *mut cuda_types::cufft::cufftReal, + ) -> cuda_types::cufft::cufftResult; + fn cufftExecZ2Z( + plan: cuda_types::cufft::cufftHandle, + idata: *mut cuda_types::cufft::cufftDoubleComplex, + odata: *mut cuda_types::cufft::cufftDoubleComplex, + direction: ::core::ffi::c_int, + ) -> cuda_types::cufft::cufftResult; + fn cufftExecD2Z( + plan: cuda_types::cufft::cufftHandle, + idata: *mut cuda_types::cufft::cufftDoubleReal, + odata: *mut cuda_types::cufft::cufftDoubleComplex, + ) -> cuda_types::cufft::cufftResult; + fn cufftExecZ2D( + plan: cuda_types::cufft::cufftHandle, + idata: *mut cuda_types::cufft::cufftDoubleComplex, + odata: *mut cuda_types::cufft::cufftDoubleReal, + ) -> cuda_types::cufft::cufftResult; + fn cufftSetStream( + plan: cuda_types::cufft::cufftHandle, + stream: cuda_types::cufft::cudaStream_t, + ) -> cuda_types::cufft::cufftResult; + fn cufftDestroy( + plan: cuda_types::cufft::cufftHandle, + ) -> cuda_types::cufft::cufftResult; + fn cufftGetVersion( + version: *mut ::core::ffi::c_int, + ) -> cuda_types::cufft::cufftResult; + fn cufftGetProperty( + type_: cuda_types::cufft::libraryPropertyType, + value: *mut ::core::ffi::c_int, + ) -> cuda_types::cufft::cufftResult; + fn cufftSetPlanPropertyInt64( + plan: cuda_types::cufft::cufftHandle, + property: cuda_types::cufft::cufftProperty, + inputValueInt: ::core::ffi::c_longlong, + ) -> cuda_types::cufft::cufftResult; + fn 
cufftGetPlanPropertyInt64( + plan: cuda_types::cufft::cufftHandle, + property: cuda_types::cufft::cufftProperty, + returnPtrValue: *mut ::core::ffi::c_longlong, + ) -> cuda_types::cufft::cufftResult; + fn cufftResetPlanProperty( + plan: cuda_types::cufft::cufftHandle, + property: cuda_types::cufft::cufftProperty, + ) -> cuda_types::cufft::cufftResult; + fn cufftXtSetGPUs( + handle: cuda_types::cufft::cufftHandle, + nGPUs: ::core::ffi::c_int, + whichGPUs: *mut ::core::ffi::c_int, + ) -> cuda_types::cufft::cufftResult; + fn cufftXtMalloc( + plan: cuda_types::cufft::cufftHandle, + descriptor: *mut *mut cuda_types::cufft::cudaLibXtDesc, + format: cuda_types::cufft::cufftXtSubFormat, + ) -> cuda_types::cufft::cufftResult; + fn cufftXtMemcpy( + plan: cuda_types::cufft::cufftHandle, + dstPointer: *mut ::core::ffi::c_void, + srcPointer: *mut ::core::ffi::c_void, + type_: cuda_types::cufft::cufftXtCopyType, + ) -> cuda_types::cufft::cufftResult; + fn cufftXtFree( + descriptor: *mut cuda_types::cufft::cudaLibXtDesc, + ) -> cuda_types::cufft::cufftResult; + fn cufftXtSetWorkArea( + plan: cuda_types::cufft::cufftHandle, + workArea: *mut *mut ::core::ffi::c_void, + ) -> cuda_types::cufft::cufftResult; + fn cufftXtExecDescriptorC2C( + plan: cuda_types::cufft::cufftHandle, + input: *mut cuda_types::cufft::cudaLibXtDesc, + output: *mut cuda_types::cufft::cudaLibXtDesc, + direction: ::core::ffi::c_int, + ) -> cuda_types::cufft::cufftResult; + fn cufftXtExecDescriptorR2C( + plan: cuda_types::cufft::cufftHandle, + input: *mut cuda_types::cufft::cudaLibXtDesc, + output: *mut cuda_types::cufft::cudaLibXtDesc, + ) -> cuda_types::cufft::cufftResult; + fn cufftXtExecDescriptorC2R( + plan: cuda_types::cufft::cufftHandle, + input: *mut cuda_types::cufft::cudaLibXtDesc, + output: *mut cuda_types::cufft::cudaLibXtDesc, + ) -> cuda_types::cufft::cufftResult; + fn cufftXtExecDescriptorZ2Z( + plan: cuda_types::cufft::cufftHandle, + input: *mut cuda_types::cufft::cudaLibXtDesc, + output: *mut cuda_types::cufft::cudaLibXtDesc, + direction: ::core::ffi::c_int, + ) -> cuda_types::cufft::cufftResult; + fn cufftXtExecDescriptorD2Z( + plan: cuda_types::cufft::cufftHandle, + input: *mut cuda_types::cufft::cudaLibXtDesc, + output: *mut cuda_types::cufft::cudaLibXtDesc, + ) -> cuda_types::cufft::cufftResult; + fn cufftXtExecDescriptorZ2D( + plan: cuda_types::cufft::cufftHandle, + input: *mut cuda_types::cufft::cudaLibXtDesc, + output: *mut cuda_types::cufft::cudaLibXtDesc, + ) -> cuda_types::cufft::cufftResult; + fn cufftXtQueryPlan( + plan: cuda_types::cufft::cufftHandle, + queryStruct: *mut ::core::ffi::c_void, + queryType: cuda_types::cufft::cufftXtQueryType, + ) -> cuda_types::cufft::cufftResult; + fn cufftXtSetCallback( + plan: cuda_types::cufft::cufftHandle, + callback_routine: *mut *mut ::core::ffi::c_void, + cbType: cuda_types::cufft::cufftXtCallbackType, + caller_info: *mut *mut ::core::ffi::c_void, + ) -> cuda_types::cufft::cufftResult; + fn cufftXtClearCallback( + plan: cuda_types::cufft::cufftHandle, + cbType: cuda_types::cufft::cufftXtCallbackType, + ) -> cuda_types::cufft::cufftResult; + fn cufftXtSetCallbackSharedSize( + plan: cuda_types::cufft::cufftHandle, + cbType: cuda_types::cufft::cufftXtCallbackType, + sharedSize: usize, + ) -> cuda_types::cufft::cufftResult; + fn cufftXtMakePlanMany( + plan: cuda_types::cufft::cufftHandle, + rank: ::core::ffi::c_int, + n: *mut ::core::ffi::c_longlong, + inembed: *mut ::core::ffi::c_longlong, + istride: ::core::ffi::c_longlong, + idist: ::core::ffi::c_longlong, + inputtype: 
cuda_types::cufft::cudaDataType, + onembed: *mut ::core::ffi::c_longlong, + ostride: ::core::ffi::c_longlong, + odist: ::core::ffi::c_longlong, + outputtype: cuda_types::cufft::cudaDataType, + batch: ::core::ffi::c_longlong, + workSize: *mut usize, + executiontype: cuda_types::cufft::cudaDataType, + ) -> cuda_types::cufft::cufftResult; + fn cufftXtGetSizeMany( + plan: cuda_types::cufft::cufftHandle, + rank: ::core::ffi::c_int, + n: *mut ::core::ffi::c_longlong, + inembed: *mut ::core::ffi::c_longlong, + istride: ::core::ffi::c_longlong, + idist: ::core::ffi::c_longlong, + inputtype: cuda_types::cufft::cudaDataType, + onembed: *mut ::core::ffi::c_longlong, + ostride: ::core::ffi::c_longlong, + odist: ::core::ffi::c_longlong, + outputtype: cuda_types::cufft::cudaDataType, + batch: ::core::ffi::c_longlong, + workSize: *mut usize, + executiontype: cuda_types::cufft::cudaDataType, + ) -> cuda_types::cufft::cufftResult; + fn cufftXtExec( + plan: cuda_types::cufft::cufftHandle, + input: *mut ::core::ffi::c_void, + output: *mut ::core::ffi::c_void, + direction: ::core::ffi::c_int, + ) -> cuda_types::cufft::cufftResult; + fn cufftXtExecDescriptor( + plan: cuda_types::cufft::cufftHandle, + input: *mut cuda_types::cufft::cudaLibXtDesc, + output: *mut cuda_types::cufft::cudaLibXtDesc, + direction: ::core::ffi::c_int, + ) -> cuda_types::cufft::cufftResult; + fn cufftXtSetWorkAreaPolicy( + plan: cuda_types::cufft::cufftHandle, + policy: cuda_types::cufft::cufftXtWorkAreaPolicy, + workSize: *mut usize, + ) -> cuda_types::cufft::cufftResult; +}
diff --git a/cuda_base/src/cusparse.rs b/cuda_base/src/cusparse.rs
new file mode 100644
index 0000000..31f9873
--- /dev/null
+++ b/cuda_base/src/cusparse.rs
@@ -0,0 +1,5518 @@
+// Generated automatically by zluda_bindgen +// DO NOT EDIT MANUALLY +#![allow(warnings)] +extern "system" { + #[must_use] + fn cusparseCreate( + handle: *mut cuda_types::cusparse::cusparseHandle_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDestroy( + handle: cuda_types::cusparse::cusparseHandle_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseGetVersion( + handle: cuda_types::cusparse::cusparseHandle_t, + version: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseGetProperty( + type_: cuda_types::cusparse::libraryPropertyType, + value: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + fn cusparseGetErrorName( + status: cuda_types::cusparse::cusparseStatus_t, + ) -> *const ::core::ffi::c_char; + fn cusparseGetErrorString( + status: cuda_types::cusparse::cusparseStatus_t, + ) -> *const ::core::ffi::c_char; + #[must_use] + fn cusparseSetStream( + handle: cuda_types::cusparse::cusparseHandle_t, + streamId: cuda_types::cusparse::cudaStream_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseGetStream( + handle: cuda_types::cusparse::cusparseHandle_t, + streamId: *mut cuda_types::cusparse::cudaStream_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseGetPointerMode( + handle: cuda_types::cusparse::cusparseHandle_t, + mode: *mut cuda_types::cusparse::cusparsePointerMode_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSetPointerMode( + handle: cuda_types::cusparse::cusparseHandle_t, + mode: cuda_types::cusparse::cusparsePointerMode_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseLoggerSetCallback( + callback: cuda_types::cusparse::cusparseLoggerCallback_t, + ) 
-> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseLoggerSetFile(file: *mut FILE) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseLoggerOpenFile( + logFile: *const ::core::ffi::c_char, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseLoggerSetLevel( + level: ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseLoggerSetMask( + mask: ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseLoggerForceDisable() -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCreateMatDescr( + descrA: *mut cuda_types::cusparse::cusparseMatDescr_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDestroyMatDescr( + descrA: cuda_types::cusparse::cusparseMatDescr_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSetMatType( + descrA: cuda_types::cusparse::cusparseMatDescr_t, + type_: cuda_types::cusparse::cusparseMatrixType_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + fn cusparseGetMatType( + descrA: cuda_types::cusparse::cusparseMatDescr_t, + ) -> cuda_types::cusparse::cusparseMatrixType_t; + #[must_use] + fn cusparseSetMatFillMode( + descrA: cuda_types::cusparse::cusparseMatDescr_t, + fillMode: cuda_types::cusparse::cusparseFillMode_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + fn cusparseGetMatFillMode( + descrA: cuda_types::cusparse::cusparseMatDescr_t, + ) -> cuda_types::cusparse::cusparseFillMode_t; + #[must_use] + fn cusparseSetMatDiagType( + descrA: cuda_types::cusparse::cusparseMatDescr_t, + diagType: cuda_types::cusparse::cusparseDiagType_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + fn cusparseGetMatDiagType( + descrA: cuda_types::cusparse::cusparseMatDescr_t, + ) -> cuda_types::cusparse::cusparseDiagType_t; + #[must_use] + fn cusparseSetMatIndexBase( + descrA: cuda_types::cusparse::cusparseMatDescr_t, + base: cuda_types::cusparse::cusparseIndexBase_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + fn cusparseGetMatIndexBase( + descrA: cuda_types::cusparse::cusparseMatDescr_t, + ) -> cuda_types::cusparse::cusparseIndexBase_t; + #[must_use] + fn cusparseCreateCsric02Info( + info: *mut cuda_types::cusparse::csric02Info_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDestroyCsric02Info( + info: cuda_types::cusparse::csric02Info_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCreateBsric02Info( + info: *mut cuda_types::cusparse::bsric02Info_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDestroyBsric02Info( + info: cuda_types::cusparse::bsric02Info_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCreateCsrilu02Info( + info: *mut cuda_types::cusparse::csrilu02Info_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDestroyCsrilu02Info( + info: cuda_types::cusparse::csrilu02Info_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCreateBsrilu02Info( + info: *mut cuda_types::cusparse::bsrilu02Info_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDestroyBsrilu02Info( + info: cuda_types::cusparse::bsrilu02Info_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCreateBsrsv2Info( + info: *mut cuda_types::cusparse::bsrsv2Info_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDestroyBsrsv2Info( + info: cuda_types::cusparse::bsrsv2Info_t, 
+ ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCreateBsrsm2Info( + info: *mut cuda_types::cusparse::bsrsm2Info_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDestroyBsrsm2Info( + info: cuda_types::cusparse::bsrsm2Info_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCreateCsru2csrInfo( + info: *mut cuda_types::cusparse::csru2csrInfo_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDestroyCsru2csrInfo( + info: cuda_types::cusparse::csru2csrInfo_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCreateColorInfo( + info: *mut cuda_types::cusparse::cusparseColorInfo_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDestroyColorInfo( + info: cuda_types::cusparse::cusparseColorInfo_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCreatePruneInfo( + info: *mut cuda_types::cusparse::pruneInfo_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDestroyPruneInfo( + info: cuda_types::cusparse::pruneInfo_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSgemvi( + handle: cuda_types::cusparse::cusparseHandle_t, + transA: cuda_types::cusparse::cusparseOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const f32, + A: *const f32, + lda: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + xVal: *const f32, + xInd: *const ::core::ffi::c_int, + beta: *const f32, + y: *mut f32, + idxBase: cuda_types::cusparse::cusparseIndexBase_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSgemvi_bufferSize( + handle: cuda_types::cusparse::cusparseHandle_t, + transA: cuda_types::cusparse::cusparseOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + pBufferSize: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDgemvi( + handle: cuda_types::cusparse::cusparseHandle_t, + transA: cuda_types::cusparse::cusparseOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const f64, + A: *const f64, + lda: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + xVal: *const f64, + xInd: *const ::core::ffi::c_int, + beta: *const f64, + y: *mut f64, + idxBase: cuda_types::cusparse::cusparseIndexBase_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDgemvi_bufferSize( + handle: cuda_types::cusparse::cusparseHandle_t, + transA: cuda_types::cusparse::cusparseOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + pBufferSize: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCgemvi( + handle: cuda_types::cusparse::cusparseHandle_t, + transA: cuda_types::cusparse::cusparseOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const cuda_types::cusparse::cuComplex, + A: *const cuda_types::cusparse::cuComplex, + lda: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + xVal: *const cuda_types::cusparse::cuComplex, + xInd: *const ::core::ffi::c_int, + beta: *const cuda_types::cusparse::cuComplex, + y: *mut cuda_types::cusparse::cuComplex, + idxBase: cuda_types::cusparse::cusparseIndexBase_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCgemvi_bufferSize( + handle: 
cuda_types::cusparse::cusparseHandle_t, + transA: cuda_types::cusparse::cusparseOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + pBufferSize: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZgemvi( + handle: cuda_types::cusparse::cusparseHandle_t, + transA: cuda_types::cusparse::cusparseOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const cuda_types::cusparse::cuDoubleComplex, + A: *const cuda_types::cusparse::cuDoubleComplex, + lda: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + xVal: *const cuda_types::cusparse::cuDoubleComplex, + xInd: *const ::core::ffi::c_int, + beta: *const cuda_types::cusparse::cuDoubleComplex, + y: *mut cuda_types::cusparse::cuDoubleComplex, + idxBase: cuda_types::cusparse::cusparseIndexBase_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZgemvi_bufferSize( + handle: cuda_types::cusparse::cusparseHandle_t, + transA: cuda_types::cusparse::cusparseOperation_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + pBufferSize: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSbsrmv( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + transA: cuda_types::cusparse::cusparseOperation_t, + mb: ::core::ffi::c_int, + nb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + alpha: *const f32, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValA: *const f32, + bsrSortedRowPtrA: *const ::core::ffi::c_int, + bsrSortedColIndA: *const ::core::ffi::c_int, + blockDim: ::core::ffi::c_int, + x: *const f32, + beta: *const f32, + y: *mut f32, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDbsrmv( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + transA: cuda_types::cusparse::cusparseOperation_t, + mb: ::core::ffi::c_int, + nb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + alpha: *const f64, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValA: *const f64, + bsrSortedRowPtrA: *const ::core::ffi::c_int, + bsrSortedColIndA: *const ::core::ffi::c_int, + blockDim: ::core::ffi::c_int, + x: *const f64, + beta: *const f64, + y: *mut f64, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCbsrmv( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + transA: cuda_types::cusparse::cusparseOperation_t, + mb: ::core::ffi::c_int, + nb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + alpha: *const cuda_types::cusparse::cuComplex, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValA: *const cuda_types::cusparse::cuComplex, + bsrSortedRowPtrA: *const ::core::ffi::c_int, + bsrSortedColIndA: *const ::core::ffi::c_int, + blockDim: ::core::ffi::c_int, + x: *const cuda_types::cusparse::cuComplex, + beta: *const cuda_types::cusparse::cuComplex, + y: *mut cuda_types::cusparse::cuComplex, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZbsrmv( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + transA: cuda_types::cusparse::cusparseOperation_t, + mb: ::core::ffi::c_int, + nb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + alpha: *const cuda_types::cusparse::cuDoubleComplex, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + 
bsrSortedValA: *const cuda_types::cusparse::cuDoubleComplex, + bsrSortedRowPtrA: *const ::core::ffi::c_int, + bsrSortedColIndA: *const ::core::ffi::c_int, + blockDim: ::core::ffi::c_int, + x: *const cuda_types::cusparse::cuDoubleComplex, + beta: *const cuda_types::cusparse::cuDoubleComplex, + y: *mut cuda_types::cusparse::cuDoubleComplex, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSbsrxmv( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + transA: cuda_types::cusparse::cusparseOperation_t, + sizeOfMask: ::core::ffi::c_int, + mb: ::core::ffi::c_int, + nb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + alpha: *const f32, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValA: *const f32, + bsrSortedMaskPtrA: *const ::core::ffi::c_int, + bsrSortedRowPtrA: *const ::core::ffi::c_int, + bsrSortedEndPtrA: *const ::core::ffi::c_int, + bsrSortedColIndA: *const ::core::ffi::c_int, + blockDim: ::core::ffi::c_int, + x: *const f32, + beta: *const f32, + y: *mut f32, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDbsrxmv( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + transA: cuda_types::cusparse::cusparseOperation_t, + sizeOfMask: ::core::ffi::c_int, + mb: ::core::ffi::c_int, + nb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + alpha: *const f64, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValA: *const f64, + bsrSortedMaskPtrA: *const ::core::ffi::c_int, + bsrSortedRowPtrA: *const ::core::ffi::c_int, + bsrSortedEndPtrA: *const ::core::ffi::c_int, + bsrSortedColIndA: *const ::core::ffi::c_int, + blockDim: ::core::ffi::c_int, + x: *const f64, + beta: *const f64, + y: *mut f64, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCbsrxmv( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + transA: cuda_types::cusparse::cusparseOperation_t, + sizeOfMask: ::core::ffi::c_int, + mb: ::core::ffi::c_int, + nb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + alpha: *const cuda_types::cusparse::cuComplex, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValA: *const cuda_types::cusparse::cuComplex, + bsrSortedMaskPtrA: *const ::core::ffi::c_int, + bsrSortedRowPtrA: *const ::core::ffi::c_int, + bsrSortedEndPtrA: *const ::core::ffi::c_int, + bsrSortedColIndA: *const ::core::ffi::c_int, + blockDim: ::core::ffi::c_int, + x: *const cuda_types::cusparse::cuComplex, + beta: *const cuda_types::cusparse::cuComplex, + y: *mut cuda_types::cusparse::cuComplex, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZbsrxmv( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + transA: cuda_types::cusparse::cusparseOperation_t, + sizeOfMask: ::core::ffi::c_int, + mb: ::core::ffi::c_int, + nb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + alpha: *const cuda_types::cusparse::cuDoubleComplex, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValA: *const cuda_types::cusparse::cuDoubleComplex, + bsrSortedMaskPtrA: *const ::core::ffi::c_int, + bsrSortedRowPtrA: *const ::core::ffi::c_int, + bsrSortedEndPtrA: *const ::core::ffi::c_int, + bsrSortedColIndA: *const ::core::ffi::c_int, + blockDim: ::core::ffi::c_int, + x: *const cuda_types::cusparse::cuDoubleComplex, + beta: *const cuda_types::cusparse::cuDoubleComplex, + y: *mut cuda_types::cusparse::cuDoubleComplex, 
+ ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseXbsrsv2_zeroPivot( + handle: cuda_types::cusparse::cusparseHandle_t, + info: cuda_types::cusparse::bsrsv2Info_t, + position: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSbsrsv2_bufferSize( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + transA: cuda_types::cusparse::cusparseOperation_t, + mb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValA: *mut f32, + bsrSortedRowPtrA: *const ::core::ffi::c_int, + bsrSortedColIndA: *const ::core::ffi::c_int, + blockDim: ::core::ffi::c_int, + info: cuda_types::cusparse::bsrsv2Info_t, + pBufferSizeInBytes: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDbsrsv2_bufferSize( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + transA: cuda_types::cusparse::cusparseOperation_t, + mb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValA: *mut f64, + bsrSortedRowPtrA: *const ::core::ffi::c_int, + bsrSortedColIndA: *const ::core::ffi::c_int, + blockDim: ::core::ffi::c_int, + info: cuda_types::cusparse::bsrsv2Info_t, + pBufferSizeInBytes: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCbsrsv2_bufferSize( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + transA: cuda_types::cusparse::cusparseOperation_t, + mb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValA: *mut cuda_types::cusparse::cuComplex, + bsrSortedRowPtrA: *const ::core::ffi::c_int, + bsrSortedColIndA: *const ::core::ffi::c_int, + blockDim: ::core::ffi::c_int, + info: cuda_types::cusparse::bsrsv2Info_t, + pBufferSizeInBytes: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZbsrsv2_bufferSize( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + transA: cuda_types::cusparse::cusparseOperation_t, + mb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValA: *mut cuda_types::cusparse::cuDoubleComplex, + bsrSortedRowPtrA: *const ::core::ffi::c_int, + bsrSortedColIndA: *const ::core::ffi::c_int, + blockDim: ::core::ffi::c_int, + info: cuda_types::cusparse::bsrsv2Info_t, + pBufferSizeInBytes: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSbsrsv2_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + transA: cuda_types::cusparse::cusparseOperation_t, + mb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValA: *mut f32, + bsrSortedRowPtrA: *const ::core::ffi::c_int, + bsrSortedColIndA: *const ::core::ffi::c_int, + blockSize: ::core::ffi::c_int, + info: cuda_types::cusparse::bsrsv2Info_t, + pBufferSize: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDbsrsv2_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + transA: cuda_types::cusparse::cusparseOperation_t, + mb: ::core::ffi::c_int, + nnzb: 
::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValA: *mut f64, + bsrSortedRowPtrA: *const ::core::ffi::c_int, + bsrSortedColIndA: *const ::core::ffi::c_int, + blockSize: ::core::ffi::c_int, + info: cuda_types::cusparse::bsrsv2Info_t, + pBufferSize: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCbsrsv2_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + transA: cuda_types::cusparse::cusparseOperation_t, + mb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValA: *mut cuda_types::cusparse::cuComplex, + bsrSortedRowPtrA: *const ::core::ffi::c_int, + bsrSortedColIndA: *const ::core::ffi::c_int, + blockSize: ::core::ffi::c_int, + info: cuda_types::cusparse::bsrsv2Info_t, + pBufferSize: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZbsrsv2_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + transA: cuda_types::cusparse::cusparseOperation_t, + mb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValA: *mut cuda_types::cusparse::cuDoubleComplex, + bsrSortedRowPtrA: *const ::core::ffi::c_int, + bsrSortedColIndA: *const ::core::ffi::c_int, + blockSize: ::core::ffi::c_int, + info: cuda_types::cusparse::bsrsv2Info_t, + pBufferSize: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSbsrsv2_analysis( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + transA: cuda_types::cusparse::cusparseOperation_t, + mb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValA: *const f32, + bsrSortedRowPtrA: *const ::core::ffi::c_int, + bsrSortedColIndA: *const ::core::ffi::c_int, + blockDim: ::core::ffi::c_int, + info: cuda_types::cusparse::bsrsv2Info_t, + policy: cuda_types::cusparse::cusparseSolvePolicy_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDbsrsv2_analysis( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + transA: cuda_types::cusparse::cusparseOperation_t, + mb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValA: *const f64, + bsrSortedRowPtrA: *const ::core::ffi::c_int, + bsrSortedColIndA: *const ::core::ffi::c_int, + blockDim: ::core::ffi::c_int, + info: cuda_types::cusparse::bsrsv2Info_t, + policy: cuda_types::cusparse::cusparseSolvePolicy_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCbsrsv2_analysis( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + transA: cuda_types::cusparse::cusparseOperation_t, + mb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValA: *const cuda_types::cusparse::cuComplex, + bsrSortedRowPtrA: *const ::core::ffi::c_int, + bsrSortedColIndA: *const ::core::ffi::c_int, + blockDim: ::core::ffi::c_int, + info: cuda_types::cusparse::bsrsv2Info_t, + policy: cuda_types::cusparse::cusparseSolvePolicy_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn 
cusparseZbsrsv2_analysis( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + transA: cuda_types::cusparse::cusparseOperation_t, + mb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValA: *const cuda_types::cusparse::cuDoubleComplex, + bsrSortedRowPtrA: *const ::core::ffi::c_int, + bsrSortedColIndA: *const ::core::ffi::c_int, + blockDim: ::core::ffi::c_int, + info: cuda_types::cusparse::bsrsv2Info_t, + policy: cuda_types::cusparse::cusparseSolvePolicy_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSbsrsv2_solve( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + transA: cuda_types::cusparse::cusparseOperation_t, + mb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + alpha: *const f32, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValA: *const f32, + bsrSortedRowPtrA: *const ::core::ffi::c_int, + bsrSortedColIndA: *const ::core::ffi::c_int, + blockDim: ::core::ffi::c_int, + info: cuda_types::cusparse::bsrsv2Info_t, + f: *const f32, + x: *mut f32, + policy: cuda_types::cusparse::cusparseSolvePolicy_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDbsrsv2_solve( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + transA: cuda_types::cusparse::cusparseOperation_t, + mb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + alpha: *const f64, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValA: *const f64, + bsrSortedRowPtrA: *const ::core::ffi::c_int, + bsrSortedColIndA: *const ::core::ffi::c_int, + blockDim: ::core::ffi::c_int, + info: cuda_types::cusparse::bsrsv2Info_t, + f: *const f64, + x: *mut f64, + policy: cuda_types::cusparse::cusparseSolvePolicy_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCbsrsv2_solve( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + transA: cuda_types::cusparse::cusparseOperation_t, + mb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + alpha: *const cuda_types::cusparse::cuComplex, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValA: *const cuda_types::cusparse::cuComplex, + bsrSortedRowPtrA: *const ::core::ffi::c_int, + bsrSortedColIndA: *const ::core::ffi::c_int, + blockDim: ::core::ffi::c_int, + info: cuda_types::cusparse::bsrsv2Info_t, + f: *const cuda_types::cusparse::cuComplex, + x: *mut cuda_types::cusparse::cuComplex, + policy: cuda_types::cusparse::cusparseSolvePolicy_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZbsrsv2_solve( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + transA: cuda_types::cusparse::cusparseOperation_t, + mb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + alpha: *const cuda_types::cusparse::cuDoubleComplex, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValA: *const cuda_types::cusparse::cuDoubleComplex, + bsrSortedRowPtrA: *const ::core::ffi::c_int, + bsrSortedColIndA: *const ::core::ffi::c_int, + blockDim: ::core::ffi::c_int, + info: cuda_types::cusparse::bsrsv2Info_t, + f: *const cuda_types::cusparse::cuDoubleComplex, + x: *mut cuda_types::cusparse::cuDoubleComplex, + policy: 
cuda_types::cusparse::cusparseSolvePolicy_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSbsrmm( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + transA: cuda_types::cusparse::cusparseOperation_t, + transB: cuda_types::cusparse::cusparseOperation_t, + mb: ::core::ffi::c_int, + n: ::core::ffi::c_int, + kb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + alpha: *const f32, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValA: *const f32, + bsrSortedRowPtrA: *const ::core::ffi::c_int, + bsrSortedColIndA: *const ::core::ffi::c_int, + blockSize: ::core::ffi::c_int, + B: *const f32, + ldb: ::core::ffi::c_int, + beta: *const f32, + C: *mut f32, + ldc: ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDbsrmm( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + transA: cuda_types::cusparse::cusparseOperation_t, + transB: cuda_types::cusparse::cusparseOperation_t, + mb: ::core::ffi::c_int, + n: ::core::ffi::c_int, + kb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + alpha: *const f64, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValA: *const f64, + bsrSortedRowPtrA: *const ::core::ffi::c_int, + bsrSortedColIndA: *const ::core::ffi::c_int, + blockSize: ::core::ffi::c_int, + B: *const f64, + ldb: ::core::ffi::c_int, + beta: *const f64, + C: *mut f64, + ldc: ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCbsrmm( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + transA: cuda_types::cusparse::cusparseOperation_t, + transB: cuda_types::cusparse::cusparseOperation_t, + mb: ::core::ffi::c_int, + n: ::core::ffi::c_int, + kb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + alpha: *const cuda_types::cusparse::cuComplex, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValA: *const cuda_types::cusparse::cuComplex, + bsrSortedRowPtrA: *const ::core::ffi::c_int, + bsrSortedColIndA: *const ::core::ffi::c_int, + blockSize: ::core::ffi::c_int, + B: *const cuda_types::cusparse::cuComplex, + ldb: ::core::ffi::c_int, + beta: *const cuda_types::cusparse::cuComplex, + C: *mut cuda_types::cusparse::cuComplex, + ldc: ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZbsrmm( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + transA: cuda_types::cusparse::cusparseOperation_t, + transB: cuda_types::cusparse::cusparseOperation_t, + mb: ::core::ffi::c_int, + n: ::core::ffi::c_int, + kb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + alpha: *const cuda_types::cusparse::cuDoubleComplex, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValA: *const cuda_types::cusparse::cuDoubleComplex, + bsrSortedRowPtrA: *const ::core::ffi::c_int, + bsrSortedColIndA: *const ::core::ffi::c_int, + blockSize: ::core::ffi::c_int, + B: *const cuda_types::cusparse::cuDoubleComplex, + ldb: ::core::ffi::c_int, + beta: *const cuda_types::cusparse::cuDoubleComplex, + C: *mut cuda_types::cusparse::cuDoubleComplex, + ldc: ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseXbsrsm2_zeroPivot( + handle: cuda_types::cusparse::cusparseHandle_t, + info: cuda_types::cusparse::bsrsm2Info_t, + position: *mut ::core::ffi::c_int, + ) -> 
cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSbsrsm2_bufferSize( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + transA: cuda_types::cusparse::cusparseOperation_t, + transXY: cuda_types::cusparse::cusparseOperation_t, + mb: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedVal: *mut f32, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + blockSize: ::core::ffi::c_int, + info: cuda_types::cusparse::bsrsm2Info_t, + pBufferSizeInBytes: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDbsrsm2_bufferSize( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + transA: cuda_types::cusparse::cusparseOperation_t, + transXY: cuda_types::cusparse::cusparseOperation_t, + mb: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedVal: *mut f64, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + blockSize: ::core::ffi::c_int, + info: cuda_types::cusparse::bsrsm2Info_t, + pBufferSizeInBytes: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCbsrsm2_bufferSize( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + transA: cuda_types::cusparse::cusparseOperation_t, + transXY: cuda_types::cusparse::cusparseOperation_t, + mb: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedVal: *mut cuda_types::cusparse::cuComplex, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + blockSize: ::core::ffi::c_int, + info: cuda_types::cusparse::bsrsm2Info_t, + pBufferSizeInBytes: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZbsrsm2_bufferSize( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + transA: cuda_types::cusparse::cusparseOperation_t, + transXY: cuda_types::cusparse::cusparseOperation_t, + mb: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedVal: *mut cuda_types::cusparse::cuDoubleComplex, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + blockSize: ::core::ffi::c_int, + info: cuda_types::cusparse::bsrsm2Info_t, + pBufferSizeInBytes: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSbsrsm2_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + transA: cuda_types::cusparse::cusparseOperation_t, + transB: cuda_types::cusparse::cusparseOperation_t, + mb: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedVal: *mut f32, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + blockSize: ::core::ffi::c_int, + info: cuda_types::cusparse::bsrsm2Info_t, + pBufferSize: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDbsrsm2_bufferSizeExt( + handle: 
cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + transA: cuda_types::cusparse::cusparseOperation_t, + transB: cuda_types::cusparse::cusparseOperation_t, + mb: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedVal: *mut f64, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + blockSize: ::core::ffi::c_int, + info: cuda_types::cusparse::bsrsm2Info_t, + pBufferSize: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCbsrsm2_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + transA: cuda_types::cusparse::cusparseOperation_t, + transB: cuda_types::cusparse::cusparseOperation_t, + mb: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedVal: *mut cuda_types::cusparse::cuComplex, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + blockSize: ::core::ffi::c_int, + info: cuda_types::cusparse::bsrsm2Info_t, + pBufferSize: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZbsrsm2_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + transA: cuda_types::cusparse::cusparseOperation_t, + transB: cuda_types::cusparse::cusparseOperation_t, + mb: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedVal: *mut cuda_types::cusparse::cuDoubleComplex, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + blockSize: ::core::ffi::c_int, + info: cuda_types::cusparse::bsrsm2Info_t, + pBufferSize: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSbsrsm2_analysis( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + transA: cuda_types::cusparse::cusparseOperation_t, + transXY: cuda_types::cusparse::cusparseOperation_t, + mb: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedVal: *const f32, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + blockSize: ::core::ffi::c_int, + info: cuda_types::cusparse::bsrsm2Info_t, + policy: cuda_types::cusparse::cusparseSolvePolicy_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDbsrsm2_analysis( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + transA: cuda_types::cusparse::cusparseOperation_t, + transXY: cuda_types::cusparse::cusparseOperation_t, + mb: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedVal: *const f64, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + blockSize: ::core::ffi::c_int, + info: cuda_types::cusparse::bsrsm2Info_t, + policy: cuda_types::cusparse::cusparseSolvePolicy_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCbsrsm2_analysis( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: 
cuda_types::cusparse::cusparseDirection_t, + transA: cuda_types::cusparse::cusparseOperation_t, + transXY: cuda_types::cusparse::cusparseOperation_t, + mb: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedVal: *const cuda_types::cusparse::cuComplex, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + blockSize: ::core::ffi::c_int, + info: cuda_types::cusparse::bsrsm2Info_t, + policy: cuda_types::cusparse::cusparseSolvePolicy_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZbsrsm2_analysis( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + transA: cuda_types::cusparse::cusparseOperation_t, + transXY: cuda_types::cusparse::cusparseOperation_t, + mb: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedVal: *const cuda_types::cusparse::cuDoubleComplex, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + blockSize: ::core::ffi::c_int, + info: cuda_types::cusparse::bsrsm2Info_t, + policy: cuda_types::cusparse::cusparseSolvePolicy_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSbsrsm2_solve( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + transA: cuda_types::cusparse::cusparseOperation_t, + transXY: cuda_types::cusparse::cusparseOperation_t, + mb: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + alpha: *const f32, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedVal: *const f32, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + blockSize: ::core::ffi::c_int, + info: cuda_types::cusparse::bsrsm2Info_t, + B: *const f32, + ldb: ::core::ffi::c_int, + X: *mut f32, + ldx: ::core::ffi::c_int, + policy: cuda_types::cusparse::cusparseSolvePolicy_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDbsrsm2_solve( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + transA: cuda_types::cusparse::cusparseOperation_t, + transXY: cuda_types::cusparse::cusparseOperation_t, + mb: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + alpha: *const f64, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedVal: *const f64, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + blockSize: ::core::ffi::c_int, + info: cuda_types::cusparse::bsrsm2Info_t, + B: *const f64, + ldb: ::core::ffi::c_int, + X: *mut f64, + ldx: ::core::ffi::c_int, + policy: cuda_types::cusparse::cusparseSolvePolicy_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCbsrsm2_solve( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + transA: cuda_types::cusparse::cusparseOperation_t, + transXY: cuda_types::cusparse::cusparseOperation_t, + mb: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + alpha: *const cuda_types::cusparse::cuComplex, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedVal: *const cuda_types::cusparse::cuComplex, + bsrSortedRowPtr: 
*const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + blockSize: ::core::ffi::c_int, + info: cuda_types::cusparse::bsrsm2Info_t, + B: *const cuda_types::cusparse::cuComplex, + ldb: ::core::ffi::c_int, + X: *mut cuda_types::cusparse::cuComplex, + ldx: ::core::ffi::c_int, + policy: cuda_types::cusparse::cusparseSolvePolicy_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZbsrsm2_solve( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + transA: cuda_types::cusparse::cusparseOperation_t, + transXY: cuda_types::cusparse::cusparseOperation_t, + mb: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + alpha: *const cuda_types::cusparse::cuDoubleComplex, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedVal: *const cuda_types::cusparse::cuDoubleComplex, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + blockSize: ::core::ffi::c_int, + info: cuda_types::cusparse::bsrsm2Info_t, + B: *const cuda_types::cusparse::cuDoubleComplex, + ldb: ::core::ffi::c_int, + X: *mut cuda_types::cusparse::cuDoubleComplex, + ldx: ::core::ffi::c_int, + policy: cuda_types::cusparse::cusparseSolvePolicy_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseScsrilu02_numericBoost( + handle: cuda_types::cusparse::cusparseHandle_t, + info: cuda_types::cusparse::csrilu02Info_t, + enable_boost: ::core::ffi::c_int, + tol: *mut f64, + boost_val: *mut f32, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDcsrilu02_numericBoost( + handle: cuda_types::cusparse::cusparseHandle_t, + info: cuda_types::cusparse::csrilu02Info_t, + enable_boost: ::core::ffi::c_int, + tol: *mut f64, + boost_val: *mut f64, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCcsrilu02_numericBoost( + handle: cuda_types::cusparse::cusparseHandle_t, + info: cuda_types::cusparse::csrilu02Info_t, + enable_boost: ::core::ffi::c_int, + tol: *mut f64, + boost_val: *mut cuda_types::cusparse::cuComplex, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZcsrilu02_numericBoost( + handle: cuda_types::cusparse::cusparseHandle_t, + info: cuda_types::cusparse::csrilu02Info_t, + enable_boost: ::core::ffi::c_int, + tol: *mut f64, + boost_val: *mut cuda_types::cusparse::cuDoubleComplex, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseXcsrilu02_zeroPivot( + handle: cuda_types::cusparse::cusparseHandle_t, + info: cuda_types::cusparse::csrilu02Info_t, + position: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseScsrilu02_bufferSize( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *mut f32, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + info: cuda_types::cusparse::csrilu02Info_t, + pBufferSizeInBytes: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDcsrilu02_bufferSize( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *mut f64, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + 
info: cuda_types::cusparse::csrilu02Info_t, + pBufferSizeInBytes: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCcsrilu02_bufferSize( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *mut cuda_types::cusparse::cuComplex, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + info: cuda_types::cusparse::csrilu02Info_t, + pBufferSizeInBytes: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZcsrilu02_bufferSize( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *mut cuda_types::cusparse::cuDoubleComplex, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + info: cuda_types::cusparse::csrilu02Info_t, + pBufferSizeInBytes: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseScsrilu02_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedVal: *mut f32, + csrSortedRowPtr: *const ::core::ffi::c_int, + csrSortedColInd: *const ::core::ffi::c_int, + info: cuda_types::cusparse::csrilu02Info_t, + pBufferSize: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDcsrilu02_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedVal: *mut f64, + csrSortedRowPtr: *const ::core::ffi::c_int, + csrSortedColInd: *const ::core::ffi::c_int, + info: cuda_types::cusparse::csrilu02Info_t, + pBufferSize: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCcsrilu02_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedVal: *mut cuda_types::cusparse::cuComplex, + csrSortedRowPtr: *const ::core::ffi::c_int, + csrSortedColInd: *const ::core::ffi::c_int, + info: cuda_types::cusparse::csrilu02Info_t, + pBufferSize: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZcsrilu02_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedVal: *mut cuda_types::cusparse::cuDoubleComplex, + csrSortedRowPtr: *const ::core::ffi::c_int, + csrSortedColInd: *const ::core::ffi::c_int, + info: cuda_types::cusparse::csrilu02Info_t, + pBufferSize: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseScsrilu02_analysis( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *const f32, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + info: cuda_types::cusparse::csrilu02Info_t, + policy: cuda_types::cusparse::cusparseSolvePolicy_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDcsrilu02_analysis( + handle: 
cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *const f64, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + info: cuda_types::cusparse::csrilu02Info_t, + policy: cuda_types::cusparse::cusparseSolvePolicy_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCcsrilu02_analysis( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *const cuda_types::cusparse::cuComplex, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + info: cuda_types::cusparse::csrilu02Info_t, + policy: cuda_types::cusparse::cusparseSolvePolicy_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZcsrilu02_analysis( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *const cuda_types::cusparse::cuDoubleComplex, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + info: cuda_types::cusparse::csrilu02Info_t, + policy: cuda_types::cusparse::cusparseSolvePolicy_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseScsrilu02( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA_valM: *mut f32, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + info: cuda_types::cusparse::csrilu02Info_t, + policy: cuda_types::cusparse::cusparseSolvePolicy_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDcsrilu02( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA_valM: *mut f64, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + info: cuda_types::cusparse::csrilu02Info_t, + policy: cuda_types::cusparse::cusparseSolvePolicy_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCcsrilu02( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA_valM: *mut cuda_types::cusparse::cuComplex, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + info: cuda_types::cusparse::csrilu02Info_t, + policy: cuda_types::cusparse::cusparseSolvePolicy_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZcsrilu02( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA_valM: *mut cuda_types::cusparse::cuDoubleComplex, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + info: cuda_types::cusparse::csrilu02Info_t, + policy: cuda_types::cusparse::cusparseSolvePolicy_t, + pBuffer: *mut 
::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSbsrilu02_numericBoost( + handle: cuda_types::cusparse::cusparseHandle_t, + info: cuda_types::cusparse::bsrilu02Info_t, + enable_boost: ::core::ffi::c_int, + tol: *mut f64, + boost_val: *mut f32, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDbsrilu02_numericBoost( + handle: cuda_types::cusparse::cusparseHandle_t, + info: cuda_types::cusparse::bsrilu02Info_t, + enable_boost: ::core::ffi::c_int, + tol: *mut f64, + boost_val: *mut f64, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCbsrilu02_numericBoost( + handle: cuda_types::cusparse::cusparseHandle_t, + info: cuda_types::cusparse::bsrilu02Info_t, + enable_boost: ::core::ffi::c_int, + tol: *mut f64, + boost_val: *mut cuda_types::cusparse::cuComplex, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZbsrilu02_numericBoost( + handle: cuda_types::cusparse::cusparseHandle_t, + info: cuda_types::cusparse::bsrilu02Info_t, + enable_boost: ::core::ffi::c_int, + tol: *mut f64, + boost_val: *mut cuda_types::cusparse::cuDoubleComplex, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseXbsrilu02_zeroPivot( + handle: cuda_types::cusparse::cusparseHandle_t, + info: cuda_types::cusparse::bsrilu02Info_t, + position: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSbsrilu02_bufferSize( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + mb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedVal: *mut f32, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + blockDim: ::core::ffi::c_int, + info: cuda_types::cusparse::bsrilu02Info_t, + pBufferSizeInBytes: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDbsrilu02_bufferSize( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + mb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedVal: *mut f64, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + blockDim: ::core::ffi::c_int, + info: cuda_types::cusparse::bsrilu02Info_t, + pBufferSizeInBytes: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCbsrilu02_bufferSize( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + mb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedVal: *mut cuda_types::cusparse::cuComplex, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + blockDim: ::core::ffi::c_int, + info: cuda_types::cusparse::bsrilu02Info_t, + pBufferSizeInBytes: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZbsrilu02_bufferSize( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + mb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedVal: *mut cuda_types::cusparse::cuDoubleComplex, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + blockDim: ::core::ffi::c_int, + info: 
cuda_types::cusparse::bsrilu02Info_t, + pBufferSizeInBytes: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSbsrilu02_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + mb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedVal: *mut f32, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + blockSize: ::core::ffi::c_int, + info: cuda_types::cusparse::bsrilu02Info_t, + pBufferSize: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDbsrilu02_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + mb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedVal: *mut f64, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + blockSize: ::core::ffi::c_int, + info: cuda_types::cusparse::bsrilu02Info_t, + pBufferSize: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCbsrilu02_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + mb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedVal: *mut cuda_types::cusparse::cuComplex, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + blockSize: ::core::ffi::c_int, + info: cuda_types::cusparse::bsrilu02Info_t, + pBufferSize: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZbsrilu02_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + mb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedVal: *mut cuda_types::cusparse::cuDoubleComplex, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + blockSize: ::core::ffi::c_int, + info: cuda_types::cusparse::bsrilu02Info_t, + pBufferSize: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSbsrilu02_analysis( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + mb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedVal: *mut f32, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + blockDim: ::core::ffi::c_int, + info: cuda_types::cusparse::bsrilu02Info_t, + policy: cuda_types::cusparse::cusparseSolvePolicy_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDbsrilu02_analysis( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + mb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedVal: *mut f64, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + blockDim: ::core::ffi::c_int, + info: cuda_types::cusparse::bsrilu02Info_t, + policy: cuda_types::cusparse::cusparseSolvePolicy_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCbsrilu02_analysis( + handle: 
cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + mb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedVal: *mut cuda_types::cusparse::cuComplex, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + blockDim: ::core::ffi::c_int, + info: cuda_types::cusparse::bsrilu02Info_t, + policy: cuda_types::cusparse::cusparseSolvePolicy_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZbsrilu02_analysis( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + mb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedVal: *mut cuda_types::cusparse::cuDoubleComplex, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + blockDim: ::core::ffi::c_int, + info: cuda_types::cusparse::bsrilu02Info_t, + policy: cuda_types::cusparse::cusparseSolvePolicy_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSbsrilu02( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + mb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedVal: *mut f32, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + blockDim: ::core::ffi::c_int, + info: cuda_types::cusparse::bsrilu02Info_t, + policy: cuda_types::cusparse::cusparseSolvePolicy_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDbsrilu02( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + mb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedVal: *mut f64, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + blockDim: ::core::ffi::c_int, + info: cuda_types::cusparse::bsrilu02Info_t, + policy: cuda_types::cusparse::cusparseSolvePolicy_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCbsrilu02( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + mb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedVal: *mut cuda_types::cusparse::cuComplex, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + blockDim: ::core::ffi::c_int, + info: cuda_types::cusparse::bsrilu02Info_t, + policy: cuda_types::cusparse::cusparseSolvePolicy_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZbsrilu02( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + mb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedVal: *mut cuda_types::cusparse::cuDoubleComplex, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + blockDim: ::core::ffi::c_int, + info: cuda_types::cusparse::bsrilu02Info_t, + policy: cuda_types::cusparse::cusparseSolvePolicy_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> 
cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseXcsric02_zeroPivot( + handle: cuda_types::cusparse::cusparseHandle_t, + info: cuda_types::cusparse::csric02Info_t, + position: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseScsric02_bufferSize( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *mut f32, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + info: cuda_types::cusparse::csric02Info_t, + pBufferSizeInBytes: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDcsric02_bufferSize( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *mut f64, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + info: cuda_types::cusparse::csric02Info_t, + pBufferSizeInBytes: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCcsric02_bufferSize( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *mut cuda_types::cusparse::cuComplex, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + info: cuda_types::cusparse::csric02Info_t, + pBufferSizeInBytes: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZcsric02_bufferSize( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *mut cuda_types::cusparse::cuDoubleComplex, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + info: cuda_types::cusparse::csric02Info_t, + pBufferSizeInBytes: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseScsric02_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedVal: *mut f32, + csrSortedRowPtr: *const ::core::ffi::c_int, + csrSortedColInd: *const ::core::ffi::c_int, + info: cuda_types::cusparse::csric02Info_t, + pBufferSize: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDcsric02_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedVal: *mut f64, + csrSortedRowPtr: *const ::core::ffi::c_int, + csrSortedColInd: *const ::core::ffi::c_int, + info: cuda_types::cusparse::csric02Info_t, + pBufferSize: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCcsric02_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedVal: *mut cuda_types::cusparse::cuComplex, + csrSortedRowPtr: *const ::core::ffi::c_int, + csrSortedColInd: *const ::core::ffi::c_int, + info: cuda_types::cusparse::csric02Info_t, + pBufferSize: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn 
cusparseZcsric02_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedVal: *mut cuda_types::cusparse::cuDoubleComplex, + csrSortedRowPtr: *const ::core::ffi::c_int, + csrSortedColInd: *const ::core::ffi::c_int, + info: cuda_types::cusparse::csric02Info_t, + pBufferSize: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseScsric02_analysis( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *const f32, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + info: cuda_types::cusparse::csric02Info_t, + policy: cuda_types::cusparse::cusparseSolvePolicy_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDcsric02_analysis( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *const f64, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + info: cuda_types::cusparse::csric02Info_t, + policy: cuda_types::cusparse::cusparseSolvePolicy_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCcsric02_analysis( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *const cuda_types::cusparse::cuComplex, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + info: cuda_types::cusparse::csric02Info_t, + policy: cuda_types::cusparse::cusparseSolvePolicy_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZcsric02_analysis( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *const cuda_types::cusparse::cuDoubleComplex, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + info: cuda_types::cusparse::csric02Info_t, + policy: cuda_types::cusparse::cusparseSolvePolicy_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseScsric02( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA_valM: *mut f32, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + info: cuda_types::cusparse::csric02Info_t, + policy: cuda_types::cusparse::cusparseSolvePolicy_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDcsric02( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA_valM: *mut f64, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + info: cuda_types::cusparse::csric02Info_t, + policy: cuda_types::cusparse::cusparseSolvePolicy_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + 
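(Editorial aside, not part of the generated patch.) The csric02 declarations above all follow the same legacy four-phase cuSPARSE calling pattern: query the scratch-buffer size, run the symbolic analysis, perform the numeric factorization, and probe for a zero pivot after each device phase. A minimal caller-side sketch is below; `run_csric02` and `alloc_scratch` are hypothetical names, the opaque pointer aliases merely stand in for the cuda_types::cusparse handle types, and the extern signatures are copied from the declarations in this hunk (ABI assumed `extern "C"`).

use core::ffi::{c_int, c_void};

// Opaque stand-ins for cusparseHandle_t, cusparseMatDescr_t, csric02Info_t.
type CusparseHandle = *mut c_void;
type MatDescr = *mut c_void;
type Csric02Info = *mut c_void;
type Status = c_int; // cusparseStatus_t; 0 == success
type SolvePolicy = c_int; // cusparseSolvePolicy_t

extern "C" {
    fn cusparseScsric02_bufferSize(
        handle: CusparseHandle, m: c_int, nnz: c_int, descrA: MatDescr,
        csrSortedValA: *mut f32, csrSortedRowPtrA: *const c_int,
        csrSortedColIndA: *const c_int, info: Csric02Info,
        pBufferSizeInBytes: *mut c_int,
    ) -> Status;
    fn cusparseScsric02_analysis(
        handle: CusparseHandle, m: c_int, nnz: c_int, descrA: MatDescr,
        csrSortedValA: *const f32, csrSortedRowPtrA: *const c_int,
        csrSortedColIndA: *const c_int, info: Csric02Info,
        policy: SolvePolicy, pBuffer: *mut c_void,
    ) -> Status;
    fn cusparseScsric02(
        handle: CusparseHandle, m: c_int, nnz: c_int, descrA: MatDescr,
        csrSortedValA_valM: *mut f32, csrSortedRowPtrA: *const c_int,
        csrSortedColIndA: *const c_int, info: Csric02Info,
        policy: SolvePolicy, pBuffer: *mut c_void,
    ) -> Status;
    fn cusparseXcsric02_zeroPivot(
        handle: CusparseHandle, info: Csric02Info, position: *mut c_int,
    ) -> Status;
}

/// Hypothetical driver: assumes the caller already created the handle,
/// descriptor and info object, and uploaded the CSR arrays to the device.
unsafe fn run_csric02(
    handle: CusparseHandle, m: c_int, nnz: c_int, descr: MatDescr,
    info: Csric02Info, vals: *mut f32, row_ptr: *const c_int,
    col_ind: *const c_int, policy: SolvePolicy,
    alloc_scratch: impl Fn(usize) -> *mut c_void, // caller's device allocator
) -> Result<(), Status> {
    let check = |s: Status| if s == 0 { Ok(()) } else { Err(s) };
    // 1. Ask how much device scratch space the factorization needs.
    let mut buffer_size: c_int = 0;
    check(cusparseScsric02_bufferSize(
        handle, m, nnz, descr, vals, row_ptr, col_ind, info, &mut buffer_size,
    ))?;
    let buffer = alloc_scratch(buffer_size as usize);
    // 2. Symbolic analysis, then probe for a structural zero pivot.
    let mut pivot: c_int = -1;
    check(cusparseScsric02_analysis(
        handle, m, nnz, descr, vals, row_ptr, col_ind, info, policy, buffer,
    ))?;
    check(cusparseXcsric02_zeroPivot(handle, info, &mut pivot))?;
    // 3. Numeric phase: overwrites `vals` with the incomplete Cholesky factor.
    check(cusparseScsric02(
        handle, m, nnz, descr, vals, row_ptr, col_ind, info, policy, buffer,
    ))?;
    // 4. Re-check for a numeric zero pivot introduced during factorization.
    check(cusparseXcsric02_zeroPivot(handle, info, &mut pivot))?;
    Ok(())
}

The same bufferSize / analysis / factor-or-solve shape recurs across the bsrsv2, bsrsm2, csrilu02 and bsric02 families declared in this hunk, so a wrapper of this form generalizes mechanically to each of them.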
#[must_use] + fn cusparseCcsric02( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA_valM: *mut cuda_types::cusparse::cuComplex, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + info: cuda_types::cusparse::csric02Info_t, + policy: cuda_types::cusparse::cusparseSolvePolicy_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZcsric02( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA_valM: *mut cuda_types::cusparse::cuDoubleComplex, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + info: cuda_types::cusparse::csric02Info_t, + policy: cuda_types::cusparse::cusparseSolvePolicy_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseXbsric02_zeroPivot( + handle: cuda_types::cusparse::cusparseHandle_t, + info: cuda_types::cusparse::bsric02Info_t, + position: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSbsric02_bufferSize( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + mb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedVal: *mut f32, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + blockDim: ::core::ffi::c_int, + info: cuda_types::cusparse::bsric02Info_t, + pBufferSizeInBytes: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDbsric02_bufferSize( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + mb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedVal: *mut f64, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + blockDim: ::core::ffi::c_int, + info: cuda_types::cusparse::bsric02Info_t, + pBufferSizeInBytes: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCbsric02_bufferSize( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + mb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedVal: *mut cuda_types::cusparse::cuComplex, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + blockDim: ::core::ffi::c_int, + info: cuda_types::cusparse::bsric02Info_t, + pBufferSizeInBytes: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZbsric02_bufferSize( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + mb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedVal: *mut cuda_types::cusparse::cuDoubleComplex, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + blockDim: ::core::ffi::c_int, + info: cuda_types::cusparse::bsric02Info_t, + pBufferSizeInBytes: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn 
cusparseSbsric02_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + mb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedVal: *mut f32, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + blockSize: ::core::ffi::c_int, + info: cuda_types::cusparse::bsric02Info_t, + pBufferSize: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDbsric02_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + mb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedVal: *mut f64, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + blockSize: ::core::ffi::c_int, + info: cuda_types::cusparse::bsric02Info_t, + pBufferSize: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCbsric02_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + mb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedVal: *mut cuda_types::cusparse::cuComplex, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + blockSize: ::core::ffi::c_int, + info: cuda_types::cusparse::bsric02Info_t, + pBufferSize: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZbsric02_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + mb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedVal: *mut cuda_types::cusparse::cuDoubleComplex, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + blockSize: ::core::ffi::c_int, + info: cuda_types::cusparse::bsric02Info_t, + pBufferSize: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSbsric02_analysis( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + mb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedVal: *const f32, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + blockDim: ::core::ffi::c_int, + info: cuda_types::cusparse::bsric02Info_t, + policy: cuda_types::cusparse::cusparseSolvePolicy_t, + pInputBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDbsric02_analysis( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + mb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedVal: *const f64, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + blockDim: ::core::ffi::c_int, + info: cuda_types::cusparse::bsric02Info_t, + policy: cuda_types::cusparse::cusparseSolvePolicy_t, + pInputBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCbsric02_analysis( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + mb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: 
cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedVal: *const cuda_types::cusparse::cuComplex, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + blockDim: ::core::ffi::c_int, + info: cuda_types::cusparse::bsric02Info_t, + policy: cuda_types::cusparse::cusparseSolvePolicy_t, + pInputBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZbsric02_analysis( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + mb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedVal: *const cuda_types::cusparse::cuDoubleComplex, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + blockDim: ::core::ffi::c_int, + info: cuda_types::cusparse::bsric02Info_t, + policy: cuda_types::cusparse::cusparseSolvePolicy_t, + pInputBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSbsric02( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + mb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedVal: *mut f32, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + blockDim: ::core::ffi::c_int, + info: cuda_types::cusparse::bsric02Info_t, + policy: cuda_types::cusparse::cusparseSolvePolicy_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDbsric02( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + mb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedVal: *mut f64, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + blockDim: ::core::ffi::c_int, + info: cuda_types::cusparse::bsric02Info_t, + policy: cuda_types::cusparse::cusparseSolvePolicy_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCbsric02( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + mb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedVal: *mut cuda_types::cusparse::cuComplex, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + blockDim: ::core::ffi::c_int, + info: cuda_types::cusparse::bsric02Info_t, + policy: cuda_types::cusparse::cusparseSolvePolicy_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZbsric02( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + mb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedVal: *mut cuda_types::cusparse::cuDoubleComplex, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + blockDim: ::core::ffi::c_int, + info: cuda_types::cusparse::bsric02Info_t, + policy: cuda_types::cusparse::cusparseSolvePolicy_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSgtsv2_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: 
::core::ffi::c_int, + dl: *const f32, + d: *const f32, + du: *const f32, + B: *const f32, + ldb: ::core::ffi::c_int, + bufferSizeInBytes: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDgtsv2_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + dl: *const f64, + d: *const f64, + du: *const f64, + B: *const f64, + ldb: ::core::ffi::c_int, + bufferSizeInBytes: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCgtsv2_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + dl: *const cuda_types::cusparse::cuComplex, + d: *const cuda_types::cusparse::cuComplex, + du: *const cuda_types::cusparse::cuComplex, + B: *const cuda_types::cusparse::cuComplex, + ldb: ::core::ffi::c_int, + bufferSizeInBytes: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZgtsv2_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + dl: *const cuda_types::cusparse::cuDoubleComplex, + d: *const cuda_types::cusparse::cuDoubleComplex, + du: *const cuda_types::cusparse::cuDoubleComplex, + B: *const cuda_types::cusparse::cuDoubleComplex, + ldb: ::core::ffi::c_int, + bufferSizeInBytes: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSgtsv2( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + dl: *const f32, + d: *const f32, + du: *const f32, + B: *mut f32, + ldb: ::core::ffi::c_int, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDgtsv2( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + dl: *const f64, + d: *const f64, + du: *const f64, + B: *mut f64, + ldb: ::core::ffi::c_int, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCgtsv2( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + dl: *const cuda_types::cusparse::cuComplex, + d: *const cuda_types::cusparse::cuComplex, + du: *const cuda_types::cusparse::cuComplex, + B: *mut cuda_types::cusparse::cuComplex, + ldb: ::core::ffi::c_int, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZgtsv2( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + dl: *const cuda_types::cusparse::cuDoubleComplex, + d: *const cuda_types::cusparse::cuDoubleComplex, + du: *const cuda_types::cusparse::cuDoubleComplex, + B: *mut cuda_types::cusparse::cuDoubleComplex, + ldb: ::core::ffi::c_int, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSgtsv2_nopivot_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + dl: *const f32, + d: *const f32, + du: *const f32, + B: *const f32, + ldb: ::core::ffi::c_int, + bufferSizeInBytes: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDgtsv2_nopivot_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + dl: *const f64, + d: *const f64, + du: *const f64, + B: *const f64, + ldb: ::core::ffi::c_int, + bufferSizeInBytes: *mut usize, + ) -> 
cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCgtsv2_nopivot_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + dl: *const cuda_types::cusparse::cuComplex, + d: *const cuda_types::cusparse::cuComplex, + du: *const cuda_types::cusparse::cuComplex, + B: *const cuda_types::cusparse::cuComplex, + ldb: ::core::ffi::c_int, + bufferSizeInBytes: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZgtsv2_nopivot_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + dl: *const cuda_types::cusparse::cuDoubleComplex, + d: *const cuda_types::cusparse::cuDoubleComplex, + du: *const cuda_types::cusparse::cuDoubleComplex, + B: *const cuda_types::cusparse::cuDoubleComplex, + ldb: ::core::ffi::c_int, + bufferSizeInBytes: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSgtsv2_nopivot( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + dl: *const f32, + d: *const f32, + du: *const f32, + B: *mut f32, + ldb: ::core::ffi::c_int, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDgtsv2_nopivot( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + dl: *const f64, + d: *const f64, + du: *const f64, + B: *mut f64, + ldb: ::core::ffi::c_int, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCgtsv2_nopivot( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + dl: *const cuda_types::cusparse::cuComplex, + d: *const cuda_types::cusparse::cuComplex, + du: *const cuda_types::cusparse::cuComplex, + B: *mut cuda_types::cusparse::cuComplex, + ldb: ::core::ffi::c_int, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZgtsv2_nopivot( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + dl: *const cuda_types::cusparse::cuDoubleComplex, + d: *const cuda_types::cusparse::cuDoubleComplex, + du: *const cuda_types::cusparse::cuDoubleComplex, + B: *mut cuda_types::cusparse::cuDoubleComplex, + ldb: ::core::ffi::c_int, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSgtsv2StridedBatch_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + dl: *const f32, + d: *const f32, + du: *const f32, + x: *const f32, + batchCount: ::core::ffi::c_int, + batchStride: ::core::ffi::c_int, + bufferSizeInBytes: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDgtsv2StridedBatch_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + dl: *const f64, + d: *const f64, + du: *const f64, + x: *const f64, + batchCount: ::core::ffi::c_int, + batchStride: ::core::ffi::c_int, + bufferSizeInBytes: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCgtsv2StridedBatch_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + dl: *const cuda_types::cusparse::cuComplex, + d: *const cuda_types::cusparse::cuComplex, + du: *const cuda_types::cusparse::cuComplex, + x: *const cuda_types::cusparse::cuComplex, + batchCount: ::core::ffi::c_int, 
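+ // note: batchStride below is the element distance between the first entries of
+ // consecutive systems (at least m); the gtsv2StridedBatch family solves
+ // batchCount independent tridiagonal systems laid out this way.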
+ batchStride: ::core::ffi::c_int, + bufferSizeInBytes: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZgtsv2StridedBatch_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + dl: *const cuda_types::cusparse::cuDoubleComplex, + d: *const cuda_types::cusparse::cuDoubleComplex, + du: *const cuda_types::cusparse::cuDoubleComplex, + x: *const cuda_types::cusparse::cuDoubleComplex, + batchCount: ::core::ffi::c_int, + batchStride: ::core::ffi::c_int, + bufferSizeInBytes: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSgtsv2StridedBatch( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + dl: *const f32, + d: *const f32, + du: *const f32, + x: *mut f32, + batchCount: ::core::ffi::c_int, + batchStride: ::core::ffi::c_int, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDgtsv2StridedBatch( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + dl: *const f64, + d: *const f64, + du: *const f64, + x: *mut f64, + batchCount: ::core::ffi::c_int, + batchStride: ::core::ffi::c_int, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCgtsv2StridedBatch( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + dl: *const cuda_types::cusparse::cuComplex, + d: *const cuda_types::cusparse::cuComplex, + du: *const cuda_types::cusparse::cuComplex, + x: *mut cuda_types::cusparse::cuComplex, + batchCount: ::core::ffi::c_int, + batchStride: ::core::ffi::c_int, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZgtsv2StridedBatch( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + dl: *const cuda_types::cusparse::cuDoubleComplex, + d: *const cuda_types::cusparse::cuDoubleComplex, + du: *const cuda_types::cusparse::cuDoubleComplex, + x: *mut cuda_types::cusparse::cuDoubleComplex, + batchCount: ::core::ffi::c_int, + batchStride: ::core::ffi::c_int, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSgtsvInterleavedBatch_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + algo: ::core::ffi::c_int, + m: ::core::ffi::c_int, + dl: *const f32, + d: *const f32, + du: *const f32, + x: *const f32, + batchCount: ::core::ffi::c_int, + pBufferSizeInBytes: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDgtsvInterleavedBatch_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + algo: ::core::ffi::c_int, + m: ::core::ffi::c_int, + dl: *const f64, + d: *const f64, + du: *const f64, + x: *const f64, + batchCount: ::core::ffi::c_int, + pBufferSizeInBytes: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCgtsvInterleavedBatch_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + algo: ::core::ffi::c_int, + m: ::core::ffi::c_int, + dl: *const cuda_types::cusparse::cuComplex, + d: *const cuda_types::cusparse::cuComplex, + du: *const cuda_types::cusparse::cuComplex, + x: *const cuda_types::cusparse::cuComplex, + batchCount: ::core::ffi::c_int, + pBufferSizeInBytes: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZgtsvInterleavedBatch_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + algo: ::core::ffi::c_int, + m: 
::core::ffi::c_int, + dl: *const cuda_types::cusparse::cuDoubleComplex, + d: *const cuda_types::cusparse::cuDoubleComplex, + du: *const cuda_types::cusparse::cuDoubleComplex, + x: *const cuda_types::cusparse::cuDoubleComplex, + batchCount: ::core::ffi::c_int, + pBufferSizeInBytes: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSgtsvInterleavedBatch( + handle: cuda_types::cusparse::cusparseHandle_t, + algo: ::core::ffi::c_int, + m: ::core::ffi::c_int, + dl: *mut f32, + d: *mut f32, + du: *mut f32, + x: *mut f32, + batchCount: ::core::ffi::c_int, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDgtsvInterleavedBatch( + handle: cuda_types::cusparse::cusparseHandle_t, + algo: ::core::ffi::c_int, + m: ::core::ffi::c_int, + dl: *mut f64, + d: *mut f64, + du: *mut f64, + x: *mut f64, + batchCount: ::core::ffi::c_int, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCgtsvInterleavedBatch( + handle: cuda_types::cusparse::cusparseHandle_t, + algo: ::core::ffi::c_int, + m: ::core::ffi::c_int, + dl: *mut cuda_types::cusparse::cuComplex, + d: *mut cuda_types::cusparse::cuComplex, + du: *mut cuda_types::cusparse::cuComplex, + x: *mut cuda_types::cusparse::cuComplex, + batchCount: ::core::ffi::c_int, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZgtsvInterleavedBatch( + handle: cuda_types::cusparse::cusparseHandle_t, + algo: ::core::ffi::c_int, + m: ::core::ffi::c_int, + dl: *mut cuda_types::cusparse::cuDoubleComplex, + d: *mut cuda_types::cusparse::cuDoubleComplex, + du: *mut cuda_types::cusparse::cuDoubleComplex, + x: *mut cuda_types::cusparse::cuDoubleComplex, + batchCount: ::core::ffi::c_int, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSgpsvInterleavedBatch_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + algo: ::core::ffi::c_int, + m: ::core::ffi::c_int, + ds: *const f32, + dl: *const f32, + d: *const f32, + du: *const f32, + dw: *const f32, + x: *const f32, + batchCount: ::core::ffi::c_int, + pBufferSizeInBytes: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDgpsvInterleavedBatch_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + algo: ::core::ffi::c_int, + m: ::core::ffi::c_int, + ds: *const f64, + dl: *const f64, + d: *const f64, + du: *const f64, + dw: *const f64, + x: *const f64, + batchCount: ::core::ffi::c_int, + pBufferSizeInBytes: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCgpsvInterleavedBatch_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + algo: ::core::ffi::c_int, + m: ::core::ffi::c_int, + ds: *const cuda_types::cusparse::cuComplex, + dl: *const cuda_types::cusparse::cuComplex, + d: *const cuda_types::cusparse::cuComplex, + du: *const cuda_types::cusparse::cuComplex, + dw: *const cuda_types::cusparse::cuComplex, + x: *const cuda_types::cusparse::cuComplex, + batchCount: ::core::ffi::c_int, + pBufferSizeInBytes: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZgpsvInterleavedBatch_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + algo: ::core::ffi::c_int, + m: ::core::ffi::c_int, + ds: *const cuda_types::cusparse::cuDoubleComplex, + dl: *const cuda_types::cusparse::cuDoubleComplex, + d: *const 
cuda_types::cusparse::cuDoubleComplex, + du: *const cuda_types::cusparse::cuDoubleComplex, + dw: *const cuda_types::cusparse::cuDoubleComplex, + x: *const cuda_types::cusparse::cuDoubleComplex, + batchCount: ::core::ffi::c_int, + pBufferSizeInBytes: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSgpsvInterleavedBatch( + handle: cuda_types::cusparse::cusparseHandle_t, + algo: ::core::ffi::c_int, + m: ::core::ffi::c_int, + ds: *mut f32, + dl: *mut f32, + d: *mut f32, + du: *mut f32, + dw: *mut f32, + x: *mut f32, + batchCount: ::core::ffi::c_int, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDgpsvInterleavedBatch( + handle: cuda_types::cusparse::cusparseHandle_t, + algo: ::core::ffi::c_int, + m: ::core::ffi::c_int, + ds: *mut f64, + dl: *mut f64, + d: *mut f64, + du: *mut f64, + dw: *mut f64, + x: *mut f64, + batchCount: ::core::ffi::c_int, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCgpsvInterleavedBatch( + handle: cuda_types::cusparse::cusparseHandle_t, + algo: ::core::ffi::c_int, + m: ::core::ffi::c_int, + ds: *mut cuda_types::cusparse::cuComplex, + dl: *mut cuda_types::cusparse::cuComplex, + d: *mut cuda_types::cusparse::cuComplex, + du: *mut cuda_types::cusparse::cuComplex, + dw: *mut cuda_types::cusparse::cuComplex, + x: *mut cuda_types::cusparse::cuComplex, + batchCount: ::core::ffi::c_int, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZgpsvInterleavedBatch( + handle: cuda_types::cusparse::cusparseHandle_t, + algo: ::core::ffi::c_int, + m: ::core::ffi::c_int, + ds: *mut cuda_types::cusparse::cuDoubleComplex, + dl: *mut cuda_types::cusparse::cuDoubleComplex, + d: *mut cuda_types::cusparse::cuDoubleComplex, + du: *mut cuda_types::cusparse::cuDoubleComplex, + dw: *mut cuda_types::cusparse::cuDoubleComplex, + x: *mut cuda_types::cusparse::cuDoubleComplex, + batchCount: ::core::ffi::c_int, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseScsrgeam2_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const f32, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + nnzA: ::core::ffi::c_int, + csrSortedValA: *const f32, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + beta: *const f32, + descrB: cuda_types::cusparse::cusparseMatDescr_t, + nnzB: ::core::ffi::c_int, + csrSortedValB: *const f32, + csrSortedRowPtrB: *const ::core::ffi::c_int, + csrSortedColIndB: *const ::core::ffi::c_int, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValC: *const f32, + csrSortedRowPtrC: *const ::core::ffi::c_int, + csrSortedColIndC: *const ::core::ffi::c_int, + pBufferSizeInBytes: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDcsrgeam2_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const f64, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + nnzA: ::core::ffi::c_int, + csrSortedValA: *const f64, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + beta: *const f64, + descrB: cuda_types::cusparse::cusparseMatDescr_t, + nnzB: ::core::ffi::c_int, + csrSortedValB: *const f64, + csrSortedRowPtrB: *const ::core::ffi::c_int, + 
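+ // note: csrgeam2 computes C = alpha*A + beta*B in three steps: this size
+ // query, cusparseXcsrgeam2Nnz to fix C's sparsity pattern, then the typed
+ // csrgeam2 call itself.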
csrSortedColIndB: *const ::core::ffi::c_int, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValC: *const f64, + csrSortedRowPtrC: *const ::core::ffi::c_int, + csrSortedColIndC: *const ::core::ffi::c_int, + pBufferSizeInBytes: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCcsrgeam2_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const cuda_types::cusparse::cuComplex, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + nnzA: ::core::ffi::c_int, + csrSortedValA: *const cuda_types::cusparse::cuComplex, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + beta: *const cuda_types::cusparse::cuComplex, + descrB: cuda_types::cusparse::cusparseMatDescr_t, + nnzB: ::core::ffi::c_int, + csrSortedValB: *const cuda_types::cusparse::cuComplex, + csrSortedRowPtrB: *const ::core::ffi::c_int, + csrSortedColIndB: *const ::core::ffi::c_int, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValC: *const cuda_types::cusparse::cuComplex, + csrSortedRowPtrC: *const ::core::ffi::c_int, + csrSortedColIndC: *const ::core::ffi::c_int, + pBufferSizeInBytes: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZcsrgeam2_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const cuda_types::cusparse::cuDoubleComplex, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + nnzA: ::core::ffi::c_int, + csrSortedValA: *const cuda_types::cusparse::cuDoubleComplex, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + beta: *const cuda_types::cusparse::cuDoubleComplex, + descrB: cuda_types::cusparse::cusparseMatDescr_t, + nnzB: ::core::ffi::c_int, + csrSortedValB: *const cuda_types::cusparse::cuDoubleComplex, + csrSortedRowPtrB: *const ::core::ffi::c_int, + csrSortedColIndB: *const ::core::ffi::c_int, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValC: *const cuda_types::cusparse::cuDoubleComplex, + csrSortedRowPtrC: *const ::core::ffi::c_int, + csrSortedColIndC: *const ::core::ffi::c_int, + pBufferSizeInBytes: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseXcsrgeam2Nnz( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + nnzA: ::core::ffi::c_int, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + descrB: cuda_types::cusparse::cusparseMatDescr_t, + nnzB: ::core::ffi::c_int, + csrSortedRowPtrB: *const ::core::ffi::c_int, + csrSortedColIndB: *const ::core::ffi::c_int, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedRowPtrC: *mut ::core::ffi::c_int, + nnzTotalDevHostPtr: *mut ::core::ffi::c_int, + workspace: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseScsrgeam2( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const f32, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + nnzA: ::core::ffi::c_int, + csrSortedValA: *const f32, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + beta: *const f32, + descrB: cuda_types::cusparse::cusparseMatDescr_t, + nnzB: ::core::ffi::c_int, + csrSortedValB: *const f32, + csrSortedRowPtrB: 
*const ::core::ffi::c_int, + csrSortedColIndB: *const ::core::ffi::c_int, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValC: *mut f32, + csrSortedRowPtrC: *mut ::core::ffi::c_int, + csrSortedColIndC: *mut ::core::ffi::c_int, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDcsrgeam2( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const f64, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + nnzA: ::core::ffi::c_int, + csrSortedValA: *const f64, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + beta: *const f64, + descrB: cuda_types::cusparse::cusparseMatDescr_t, + nnzB: ::core::ffi::c_int, + csrSortedValB: *const f64, + csrSortedRowPtrB: *const ::core::ffi::c_int, + csrSortedColIndB: *const ::core::ffi::c_int, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValC: *mut f64, + csrSortedRowPtrC: *mut ::core::ffi::c_int, + csrSortedColIndC: *mut ::core::ffi::c_int, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCcsrgeam2( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const cuda_types::cusparse::cuComplex, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + nnzA: ::core::ffi::c_int, + csrSortedValA: *const cuda_types::cusparse::cuComplex, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + beta: *const cuda_types::cusparse::cuComplex, + descrB: cuda_types::cusparse::cusparseMatDescr_t, + nnzB: ::core::ffi::c_int, + csrSortedValB: *const cuda_types::cusparse::cuComplex, + csrSortedRowPtrB: *const ::core::ffi::c_int, + csrSortedColIndB: *const ::core::ffi::c_int, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValC: *mut cuda_types::cusparse::cuComplex, + csrSortedRowPtrC: *mut ::core::ffi::c_int, + csrSortedColIndC: *mut ::core::ffi::c_int, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZcsrgeam2( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + alpha: *const cuda_types::cusparse::cuDoubleComplex, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + nnzA: ::core::ffi::c_int, + csrSortedValA: *const cuda_types::cusparse::cuDoubleComplex, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + beta: *const cuda_types::cusparse::cuDoubleComplex, + descrB: cuda_types::cusparse::cusparseMatDescr_t, + nnzB: ::core::ffi::c_int, + csrSortedValB: *const cuda_types::cusparse::cuDoubleComplex, + csrSortedRowPtrB: *const ::core::ffi::c_int, + csrSortedColIndB: *const ::core::ffi::c_int, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValC: *mut cuda_types::cusparse::cuDoubleComplex, + csrSortedRowPtrC: *mut ::core::ffi::c_int, + csrSortedColIndC: *mut ::core::ffi::c_int, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseScsrcolor( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *const f32, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + fractionToColor: *const f32, + ncolors: *mut ::core::ffi::c_int, + coloring: *mut 
::core::ffi::c_int, + reordering: *mut ::core::ffi::c_int, + info: cuda_types::cusparse::cusparseColorInfo_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDcsrcolor( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *const f64, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + fractionToColor: *const f64, + ncolors: *mut ::core::ffi::c_int, + coloring: *mut ::core::ffi::c_int, + reordering: *mut ::core::ffi::c_int, + info: cuda_types::cusparse::cusparseColorInfo_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCcsrcolor( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *const cuda_types::cusparse::cuComplex, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + fractionToColor: *const f32, + ncolors: *mut ::core::ffi::c_int, + coloring: *mut ::core::ffi::c_int, + reordering: *mut ::core::ffi::c_int, + info: cuda_types::cusparse::cusparseColorInfo_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZcsrcolor( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *const cuda_types::cusparse::cuDoubleComplex, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + fractionToColor: *const f64, + ncolors: *mut ::core::ffi::c_int, + coloring: *mut ::core::ffi::c_int, + reordering: *mut ::core::ffi::c_int, + info: cuda_types::cusparse::cusparseColorInfo_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSnnz( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + A: *const f32, + lda: ::core::ffi::c_int, + nnzPerRowCol: *mut ::core::ffi::c_int, + nnzTotalDevHostPtr: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDnnz( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + A: *const f64, + lda: ::core::ffi::c_int, + nnzPerRowCol: *mut ::core::ffi::c_int, + nnzTotalDevHostPtr: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCnnz( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + A: *const cuda_types::cusparse::cuComplex, + lda: ::core::ffi::c_int, + nnzPerRowCol: *mut ::core::ffi::c_int, + nnzTotalDevHostPtr: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZnnz( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + A: *const cuda_types::cusparse::cuDoubleComplex, + lda: ::core::ffi::c_int, + nnzPerRowCol: *mut ::core::ffi::c_int, + nnzTotalDevHostPtr: *mut 
::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSnnz_compress( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + descr: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *const f32, + csrSortedRowPtrA: *const ::core::ffi::c_int, + nnzPerRow: *mut ::core::ffi::c_int, + nnzC: *mut ::core::ffi::c_int, + tol: f32, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDnnz_compress( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + descr: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *const f64, + csrSortedRowPtrA: *const ::core::ffi::c_int, + nnzPerRow: *mut ::core::ffi::c_int, + nnzC: *mut ::core::ffi::c_int, + tol: f64, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCnnz_compress( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + descr: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *const cuda_types::cusparse::cuComplex, + csrSortedRowPtrA: *const ::core::ffi::c_int, + nnzPerRow: *mut ::core::ffi::c_int, + nnzC: *mut ::core::ffi::c_int, + tol: cuda_types::cusparse::cuComplex, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZnnz_compress( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + descr: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *const cuda_types::cusparse::cuDoubleComplex, + csrSortedRowPtrA: *const ::core::ffi::c_int, + nnzPerRow: *mut ::core::ffi::c_int, + nnzC: *mut ::core::ffi::c_int, + tol: cuda_types::cusparse::cuDoubleComplex, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseScsr2csr_compress( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *const f32, + csrSortedColIndA: *const ::core::ffi::c_int, + csrSortedRowPtrA: *const ::core::ffi::c_int, + nnzA: ::core::ffi::c_int, + nnzPerRow: *const ::core::ffi::c_int, + csrSortedValC: *mut f32, + csrSortedColIndC: *mut ::core::ffi::c_int, + csrSortedRowPtrC: *mut ::core::ffi::c_int, + tol: f32, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDcsr2csr_compress( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *const f64, + csrSortedColIndA: *const ::core::ffi::c_int, + csrSortedRowPtrA: *const ::core::ffi::c_int, + nnzA: ::core::ffi::c_int, + nnzPerRow: *const ::core::ffi::c_int, + csrSortedValC: *mut f64, + csrSortedColIndC: *mut ::core::ffi::c_int, + csrSortedRowPtrC: *mut ::core::ffi::c_int, + tol: f64, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCcsr2csr_compress( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *const cuda_types::cusparse::cuComplex, + csrSortedColIndA: *const ::core::ffi::c_int, + csrSortedRowPtrA: *const ::core::ffi::c_int, + nnzA: ::core::ffi::c_int, + nnzPerRow: *const ::core::ffi::c_int, + csrSortedValC: *mut cuda_types::cusparse::cuComplex, + csrSortedColIndC: *mut ::core::ffi::c_int, + csrSortedRowPtrC: *mut ::core::ffi::c_int, + tol: cuda_types::cusparse::cuComplex, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZcsr2csr_compress( + handle: 
cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *const cuda_types::cusparse::cuDoubleComplex, + csrSortedColIndA: *const ::core::ffi::c_int, + csrSortedRowPtrA: *const ::core::ffi::c_int, + nnzA: ::core::ffi::c_int, + nnzPerRow: *const ::core::ffi::c_int, + csrSortedValC: *mut cuda_types::cusparse::cuDoubleComplex, + csrSortedColIndC: *mut ::core::ffi::c_int, + csrSortedRowPtrC: *mut ::core::ffi::c_int, + tol: cuda_types::cusparse::cuDoubleComplex, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseXcoo2csr( + handle: cuda_types::cusparse::cusparseHandle_t, + cooRowInd: *const ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + m: ::core::ffi::c_int, + csrSortedRowPtr: *mut ::core::ffi::c_int, + idxBase: cuda_types::cusparse::cusparseIndexBase_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseXcsr2coo( + handle: cuda_types::cusparse::cusparseHandle_t, + csrSortedRowPtr: *const ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + m: ::core::ffi::c_int, + cooRowInd: *mut ::core::ffi::c_int, + idxBase: cuda_types::cusparse::cusparseIndexBase_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseXcsr2bsrNnz( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + blockDim: ::core::ffi::c_int, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedRowPtrC: *mut ::core::ffi::c_int, + nnzTotalDevHostPtr: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseScsr2bsr( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *const f32, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + blockDim: ::core::ffi::c_int, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValC: *mut f32, + bsrSortedRowPtrC: *mut ::core::ffi::c_int, + bsrSortedColIndC: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDcsr2bsr( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *const f64, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + blockDim: ::core::ffi::c_int, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValC: *mut f64, + bsrSortedRowPtrC: *mut ::core::ffi::c_int, + bsrSortedColIndC: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCcsr2bsr( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *const cuda_types::cusparse::cuComplex, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + blockDim: ::core::ffi::c_int, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValC: *mut 
cuda_types::cusparse::cuComplex, + bsrSortedRowPtrC: *mut ::core::ffi::c_int, + bsrSortedColIndC: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZcsr2bsr( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *const cuda_types::cusparse::cuDoubleComplex, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + blockDim: ::core::ffi::c_int, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValC: *mut cuda_types::cusparse::cuDoubleComplex, + bsrSortedRowPtrC: *mut ::core::ffi::c_int, + bsrSortedColIndC: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSbsr2csr( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + mb: ::core::ffi::c_int, + nb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValA: *const f32, + bsrSortedRowPtrA: *const ::core::ffi::c_int, + bsrSortedColIndA: *const ::core::ffi::c_int, + blockDim: ::core::ffi::c_int, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValC: *mut f32, + csrSortedRowPtrC: *mut ::core::ffi::c_int, + csrSortedColIndC: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDbsr2csr( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + mb: ::core::ffi::c_int, + nb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValA: *const f64, + bsrSortedRowPtrA: *const ::core::ffi::c_int, + bsrSortedColIndA: *const ::core::ffi::c_int, + blockDim: ::core::ffi::c_int, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValC: *mut f64, + csrSortedRowPtrC: *mut ::core::ffi::c_int, + csrSortedColIndC: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCbsr2csr( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + mb: ::core::ffi::c_int, + nb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValA: *const cuda_types::cusparse::cuComplex, + bsrSortedRowPtrA: *const ::core::ffi::c_int, + bsrSortedColIndA: *const ::core::ffi::c_int, + blockDim: ::core::ffi::c_int, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValC: *mut cuda_types::cusparse::cuComplex, + csrSortedRowPtrC: *mut ::core::ffi::c_int, + csrSortedColIndC: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZbsr2csr( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + mb: ::core::ffi::c_int, + nb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValA: *const cuda_types::cusparse::cuDoubleComplex, + bsrSortedRowPtrA: *const ::core::ffi::c_int, + bsrSortedColIndA: *const ::core::ffi::c_int, + blockDim: ::core::ffi::c_int, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValC: *mut cuda_types::cusparse::cuDoubleComplex, + csrSortedRowPtrC: *mut ::core::ffi::c_int, + csrSortedColIndC: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSgebsr2gebsc_bufferSize( + handle: cuda_types::cusparse::cusparseHandle_t, + mb: ::core::ffi::c_int, + 
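+ // note: mb/nb count block rows/columns and nnzb counts nonzero blocks;
+ // gebsr2gebsc transposes the block structure (BSR -> BSC), the blockwise
+ // analogue of a csr2csc conversion.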
nb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + bsrSortedVal: *const f32, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + rowBlockDim: ::core::ffi::c_int, + colBlockDim: ::core::ffi::c_int, + pBufferSizeInBytes: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDgebsr2gebsc_bufferSize( + handle: cuda_types::cusparse::cusparseHandle_t, + mb: ::core::ffi::c_int, + nb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + bsrSortedVal: *const f64, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + rowBlockDim: ::core::ffi::c_int, + colBlockDim: ::core::ffi::c_int, + pBufferSizeInBytes: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCgebsr2gebsc_bufferSize( + handle: cuda_types::cusparse::cusparseHandle_t, + mb: ::core::ffi::c_int, + nb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + bsrSortedVal: *const cuda_types::cusparse::cuComplex, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + rowBlockDim: ::core::ffi::c_int, + colBlockDim: ::core::ffi::c_int, + pBufferSizeInBytes: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZgebsr2gebsc_bufferSize( + handle: cuda_types::cusparse::cusparseHandle_t, + mb: ::core::ffi::c_int, + nb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + bsrSortedVal: *const cuda_types::cusparse::cuDoubleComplex, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + rowBlockDim: ::core::ffi::c_int, + colBlockDim: ::core::ffi::c_int, + pBufferSizeInBytes: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSgebsr2gebsc_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + mb: ::core::ffi::c_int, + nb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + bsrSortedVal: *const f32, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + rowBlockDim: ::core::ffi::c_int, + colBlockDim: ::core::ffi::c_int, + pBufferSize: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDgebsr2gebsc_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + mb: ::core::ffi::c_int, + nb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + bsrSortedVal: *const f64, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + rowBlockDim: ::core::ffi::c_int, + colBlockDim: ::core::ffi::c_int, + pBufferSize: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCgebsr2gebsc_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + mb: ::core::ffi::c_int, + nb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + bsrSortedVal: *const cuda_types::cusparse::cuComplex, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + rowBlockDim: ::core::ffi::c_int, + colBlockDim: ::core::ffi::c_int, + pBufferSize: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZgebsr2gebsc_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + mb: ::core::ffi::c_int, + nb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + bsrSortedVal: *const cuda_types::cusparse::cuDoubleComplex, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + rowBlockDim: ::core::ffi::c_int, + 
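+ // note: general BSR blocks may be rectangular, hence the separate rowBlockDim
+ // and colBlockDim here, where the plain bsr routines take one square blockDim.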
colBlockDim: ::core::ffi::c_int, + pBufferSize: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSgebsr2gebsc( + handle: cuda_types::cusparse::cusparseHandle_t, + mb: ::core::ffi::c_int, + nb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + bsrSortedVal: *const f32, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + rowBlockDim: ::core::ffi::c_int, + colBlockDim: ::core::ffi::c_int, + bscVal: *mut f32, + bscRowInd: *mut ::core::ffi::c_int, + bscColPtr: *mut ::core::ffi::c_int, + copyValues: cuda_types::cusparse::cusparseAction_t, + idxBase: cuda_types::cusparse::cusparseIndexBase_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDgebsr2gebsc( + handle: cuda_types::cusparse::cusparseHandle_t, + mb: ::core::ffi::c_int, + nb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + bsrSortedVal: *const f64, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + rowBlockDim: ::core::ffi::c_int, + colBlockDim: ::core::ffi::c_int, + bscVal: *mut f64, + bscRowInd: *mut ::core::ffi::c_int, + bscColPtr: *mut ::core::ffi::c_int, + copyValues: cuda_types::cusparse::cusparseAction_t, + idxBase: cuda_types::cusparse::cusparseIndexBase_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCgebsr2gebsc( + handle: cuda_types::cusparse::cusparseHandle_t, + mb: ::core::ffi::c_int, + nb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + bsrSortedVal: *const cuda_types::cusparse::cuComplex, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + rowBlockDim: ::core::ffi::c_int, + colBlockDim: ::core::ffi::c_int, + bscVal: *mut cuda_types::cusparse::cuComplex, + bscRowInd: *mut ::core::ffi::c_int, + bscColPtr: *mut ::core::ffi::c_int, + copyValues: cuda_types::cusparse::cusparseAction_t, + idxBase: cuda_types::cusparse::cusparseIndexBase_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZgebsr2gebsc( + handle: cuda_types::cusparse::cusparseHandle_t, + mb: ::core::ffi::c_int, + nb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + bsrSortedVal: *const cuda_types::cusparse::cuDoubleComplex, + bsrSortedRowPtr: *const ::core::ffi::c_int, + bsrSortedColInd: *const ::core::ffi::c_int, + rowBlockDim: ::core::ffi::c_int, + colBlockDim: ::core::ffi::c_int, + bscVal: *mut cuda_types::cusparse::cuDoubleComplex, + bscRowInd: *mut ::core::ffi::c_int, + bscColPtr: *mut ::core::ffi::c_int, + copyValues: cuda_types::cusparse::cusparseAction_t, + idxBase: cuda_types::cusparse::cusparseIndexBase_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseXgebsr2csr( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + mb: ::core::ffi::c_int, + nb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedRowPtrA: *const ::core::ffi::c_int, + bsrSortedColIndA: *const ::core::ffi::c_int, + rowBlockDim: ::core::ffi::c_int, + colBlockDim: ::core::ffi::c_int, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedRowPtrC: *mut ::core::ffi::c_int, + csrSortedColIndC: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSgebsr2csr( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: 
cuda_types::cusparse::cusparseDirection_t, + mb: ::core::ffi::c_int, + nb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValA: *const f32, + bsrSortedRowPtrA: *const ::core::ffi::c_int, + bsrSortedColIndA: *const ::core::ffi::c_int, + rowBlockDim: ::core::ffi::c_int, + colBlockDim: ::core::ffi::c_int, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValC: *mut f32, + csrSortedRowPtrC: *mut ::core::ffi::c_int, + csrSortedColIndC: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDgebsr2csr( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + mb: ::core::ffi::c_int, + nb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValA: *const f64, + bsrSortedRowPtrA: *const ::core::ffi::c_int, + bsrSortedColIndA: *const ::core::ffi::c_int, + rowBlockDim: ::core::ffi::c_int, + colBlockDim: ::core::ffi::c_int, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValC: *mut f64, + csrSortedRowPtrC: *mut ::core::ffi::c_int, + csrSortedColIndC: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCgebsr2csr( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + mb: ::core::ffi::c_int, + nb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValA: *const cuda_types::cusparse::cuComplex, + bsrSortedRowPtrA: *const ::core::ffi::c_int, + bsrSortedColIndA: *const ::core::ffi::c_int, + rowBlockDim: ::core::ffi::c_int, + colBlockDim: ::core::ffi::c_int, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValC: *mut cuda_types::cusparse::cuComplex, + csrSortedRowPtrC: *mut ::core::ffi::c_int, + csrSortedColIndC: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZgebsr2csr( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + mb: ::core::ffi::c_int, + nb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValA: *const cuda_types::cusparse::cuDoubleComplex, + bsrSortedRowPtrA: *const ::core::ffi::c_int, + bsrSortedColIndA: *const ::core::ffi::c_int, + rowBlockDim: ::core::ffi::c_int, + colBlockDim: ::core::ffi::c_int, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValC: *mut cuda_types::cusparse::cuDoubleComplex, + csrSortedRowPtrC: *mut ::core::ffi::c_int, + csrSortedColIndC: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseScsr2gebsr_bufferSize( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *const f32, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + rowBlockDim: ::core::ffi::c_int, + colBlockDim: ::core::ffi::c_int, + pBufferSizeInBytes: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDcsr2gebsr_bufferSize( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *const f64, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, 
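+ // note: rowBlockDim/colBlockDim choose the output block shape; as with the
+ // csr2bsr path above, conversion runs size query -> cusparseXcsr2gebsrNnz ->
+ // the typed csr2gebsr call.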
+ rowBlockDim: ::core::ffi::c_int, + colBlockDim: ::core::ffi::c_int, + pBufferSizeInBytes: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCcsr2gebsr_bufferSize( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *const cuda_types::cusparse::cuComplex, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + rowBlockDim: ::core::ffi::c_int, + colBlockDim: ::core::ffi::c_int, + pBufferSizeInBytes: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZcsr2gebsr_bufferSize( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *const cuda_types::cusparse::cuDoubleComplex, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + rowBlockDim: ::core::ffi::c_int, + colBlockDim: ::core::ffi::c_int, + pBufferSizeInBytes: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseScsr2gebsr_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *const f32, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + rowBlockDim: ::core::ffi::c_int, + colBlockDim: ::core::ffi::c_int, + pBufferSize: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDcsr2gebsr_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *const f64, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + rowBlockDim: ::core::ffi::c_int, + colBlockDim: ::core::ffi::c_int, + pBufferSize: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCcsr2gebsr_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *const cuda_types::cusparse::cuComplex, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + rowBlockDim: ::core::ffi::c_int, + colBlockDim: ::core::ffi::c_int, + pBufferSize: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZcsr2gebsr_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *const cuda_types::cusparse::cuDoubleComplex, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + rowBlockDim: ::core::ffi::c_int, + colBlockDim: ::core::ffi::c_int, + pBufferSize: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseXcsr2gebsrNnz( + handle: cuda_types::cusparse::cusparseHandle_t, + 
dirA: cuda_types::cusparse::cusparseDirection_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedRowPtrC: *mut ::core::ffi::c_int, + rowBlockDim: ::core::ffi::c_int, + colBlockDim: ::core::ffi::c_int, + nnzTotalDevHostPtr: *mut ::core::ffi::c_int, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseScsr2gebsr( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *const f32, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValC: *mut f32, + bsrSortedRowPtrC: *mut ::core::ffi::c_int, + bsrSortedColIndC: *mut ::core::ffi::c_int, + rowBlockDim: ::core::ffi::c_int, + colBlockDim: ::core::ffi::c_int, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDcsr2gebsr( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *const f64, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValC: *mut f64, + bsrSortedRowPtrC: *mut ::core::ffi::c_int, + bsrSortedColIndC: *mut ::core::ffi::c_int, + rowBlockDim: ::core::ffi::c_int, + colBlockDim: ::core::ffi::c_int, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCcsr2gebsr( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *const cuda_types::cusparse::cuComplex, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValC: *mut cuda_types::cusparse::cuComplex, + bsrSortedRowPtrC: *mut ::core::ffi::c_int, + bsrSortedColIndC: *mut ::core::ffi::c_int, + rowBlockDim: ::core::ffi::c_int, + colBlockDim: ::core::ffi::c_int, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZcsr2gebsr( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *const cuda_types::cusparse::cuDoubleComplex, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValC: *mut cuda_types::cusparse::cuDoubleComplex, + bsrSortedRowPtrC: *mut ::core::ffi::c_int, + bsrSortedColIndC: *mut ::core::ffi::c_int, + rowBlockDim: ::core::ffi::c_int, + colBlockDim: ::core::ffi::c_int, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSgebsr2gebsr_bufferSize( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: 
cuda_types::cusparse::cusparseDirection_t, + mb: ::core::ffi::c_int, + nb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValA: *const f32, + bsrSortedRowPtrA: *const ::core::ffi::c_int, + bsrSortedColIndA: *const ::core::ffi::c_int, + rowBlockDimA: ::core::ffi::c_int, + colBlockDimA: ::core::ffi::c_int, + rowBlockDimC: ::core::ffi::c_int, + colBlockDimC: ::core::ffi::c_int, + pBufferSizeInBytes: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDgebsr2gebsr_bufferSize( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + mb: ::core::ffi::c_int, + nb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValA: *const f64, + bsrSortedRowPtrA: *const ::core::ffi::c_int, + bsrSortedColIndA: *const ::core::ffi::c_int, + rowBlockDimA: ::core::ffi::c_int, + colBlockDimA: ::core::ffi::c_int, + rowBlockDimC: ::core::ffi::c_int, + colBlockDimC: ::core::ffi::c_int, + pBufferSizeInBytes: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCgebsr2gebsr_bufferSize( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + mb: ::core::ffi::c_int, + nb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValA: *const cuda_types::cusparse::cuComplex, + bsrSortedRowPtrA: *const ::core::ffi::c_int, + bsrSortedColIndA: *const ::core::ffi::c_int, + rowBlockDimA: ::core::ffi::c_int, + colBlockDimA: ::core::ffi::c_int, + rowBlockDimC: ::core::ffi::c_int, + colBlockDimC: ::core::ffi::c_int, + pBufferSizeInBytes: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZgebsr2gebsr_bufferSize( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + mb: ::core::ffi::c_int, + nb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValA: *const cuda_types::cusparse::cuDoubleComplex, + bsrSortedRowPtrA: *const ::core::ffi::c_int, + bsrSortedColIndA: *const ::core::ffi::c_int, + rowBlockDimA: ::core::ffi::c_int, + colBlockDimA: ::core::ffi::c_int, + rowBlockDimC: ::core::ffi::c_int, + colBlockDimC: ::core::ffi::c_int, + pBufferSizeInBytes: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSgebsr2gebsr_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + mb: ::core::ffi::c_int, + nb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValA: *const f32, + bsrSortedRowPtrA: *const ::core::ffi::c_int, + bsrSortedColIndA: *const ::core::ffi::c_int, + rowBlockDimA: ::core::ffi::c_int, + colBlockDimA: ::core::ffi::c_int, + rowBlockDimC: ::core::ffi::c_int, + colBlockDimC: ::core::ffi::c_int, + pBufferSize: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDgebsr2gebsr_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + mb: ::core::ffi::c_int, + nb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValA: *const f64, + bsrSortedRowPtrA: *const ::core::ffi::c_int, + bsrSortedColIndA: 
*const ::core::ffi::c_int, + rowBlockDimA: ::core::ffi::c_int, + colBlockDimA: ::core::ffi::c_int, + rowBlockDimC: ::core::ffi::c_int, + colBlockDimC: ::core::ffi::c_int, + pBufferSize: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCgebsr2gebsr_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + mb: ::core::ffi::c_int, + nb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValA: *const cuda_types::cusparse::cuComplex, + bsrSortedRowPtrA: *const ::core::ffi::c_int, + bsrSortedColIndA: *const ::core::ffi::c_int, + rowBlockDimA: ::core::ffi::c_int, + colBlockDimA: ::core::ffi::c_int, + rowBlockDimC: ::core::ffi::c_int, + colBlockDimC: ::core::ffi::c_int, + pBufferSize: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZgebsr2gebsr_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + mb: ::core::ffi::c_int, + nb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValA: *const cuda_types::cusparse::cuDoubleComplex, + bsrSortedRowPtrA: *const ::core::ffi::c_int, + bsrSortedColIndA: *const ::core::ffi::c_int, + rowBlockDimA: ::core::ffi::c_int, + colBlockDimA: ::core::ffi::c_int, + rowBlockDimC: ::core::ffi::c_int, + colBlockDimC: ::core::ffi::c_int, + pBufferSize: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseXgebsr2gebsrNnz( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + mb: ::core::ffi::c_int, + nb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedRowPtrA: *const ::core::ffi::c_int, + bsrSortedColIndA: *const ::core::ffi::c_int, + rowBlockDimA: ::core::ffi::c_int, + colBlockDimA: ::core::ffi::c_int, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedRowPtrC: *mut ::core::ffi::c_int, + rowBlockDimC: ::core::ffi::c_int, + colBlockDimC: ::core::ffi::c_int, + nnzTotalDevHostPtr: *mut ::core::ffi::c_int, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSgebsr2gebsr( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + mb: ::core::ffi::c_int, + nb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValA: *const f32, + bsrSortedRowPtrA: *const ::core::ffi::c_int, + bsrSortedColIndA: *const ::core::ffi::c_int, + rowBlockDimA: ::core::ffi::c_int, + colBlockDimA: ::core::ffi::c_int, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValC: *mut f32, + bsrSortedRowPtrC: *mut ::core::ffi::c_int, + bsrSortedColIndC: *mut ::core::ffi::c_int, + rowBlockDimC: ::core::ffi::c_int, + colBlockDimC: ::core::ffi::c_int, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDgebsr2gebsr( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + mb: ::core::ffi::c_int, + nb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValA: *const f64, + bsrSortedRowPtrA: *const ::core::ffi::c_int, + bsrSortedColIndA: *const ::core::ffi::c_int, + rowBlockDimA: ::core::ffi::c_int, + colBlockDimA: 
::core::ffi::c_int, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValC: *mut f64, + bsrSortedRowPtrC: *mut ::core::ffi::c_int, + bsrSortedColIndC: *mut ::core::ffi::c_int, + rowBlockDimC: ::core::ffi::c_int, + colBlockDimC: ::core::ffi::c_int, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCgebsr2gebsr( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + mb: ::core::ffi::c_int, + nb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValA: *const cuda_types::cusparse::cuComplex, + bsrSortedRowPtrA: *const ::core::ffi::c_int, + bsrSortedColIndA: *const ::core::ffi::c_int, + rowBlockDimA: ::core::ffi::c_int, + colBlockDimA: ::core::ffi::c_int, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValC: *mut cuda_types::cusparse::cuComplex, + bsrSortedRowPtrC: *mut ::core::ffi::c_int, + bsrSortedColIndC: *mut ::core::ffi::c_int, + rowBlockDimC: ::core::ffi::c_int, + colBlockDimC: ::core::ffi::c_int, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZgebsr2gebsr( + handle: cuda_types::cusparse::cusparseHandle_t, + dirA: cuda_types::cusparse::cusparseDirection_t, + mb: ::core::ffi::c_int, + nb: ::core::ffi::c_int, + nnzb: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValA: *const cuda_types::cusparse::cuDoubleComplex, + bsrSortedRowPtrA: *const ::core::ffi::c_int, + bsrSortedColIndA: *const ::core::ffi::c_int, + rowBlockDimA: ::core::ffi::c_int, + colBlockDimA: ::core::ffi::c_int, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + bsrSortedValC: *mut cuda_types::cusparse::cuDoubleComplex, + bsrSortedRowPtrC: *mut ::core::ffi::c_int, + bsrSortedColIndC: *mut ::core::ffi::c_int, + rowBlockDimC: ::core::ffi::c_int, + colBlockDimC: ::core::ffi::c_int, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCreateIdentityPermutation( + handle: cuda_types::cusparse::cusparseHandle_t, + n: ::core::ffi::c_int, + p: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseXcoosort_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + cooRowsA: *const ::core::ffi::c_int, + cooColsA: *const ::core::ffi::c_int, + pBufferSizeInBytes: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseXcoosortByRow( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + cooRowsA: *mut ::core::ffi::c_int, + cooColsA: *mut ::core::ffi::c_int, + P: *mut ::core::ffi::c_int, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseXcoosortByColumn( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + cooRowsA: *mut ::core::ffi::c_int, + cooColsA: *mut ::core::ffi::c_int, + P: *mut ::core::ffi::c_int, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseXcsrsort_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + csrRowPtrA: *const ::core::ffi::c_int, + csrColIndA: 
*const ::core::ffi::c_int, + pBufferSizeInBytes: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseXcsrsort( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrRowPtrA: *const ::core::ffi::c_int, + csrColIndA: *mut ::core::ffi::c_int, + P: *mut ::core::ffi::c_int, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseXcscsort_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + cscColPtrA: *const ::core::ffi::c_int, + cscRowIndA: *const ::core::ffi::c_int, + pBufferSizeInBytes: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseXcscsort( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + cscColPtrA: *const ::core::ffi::c_int, + cscRowIndA: *mut ::core::ffi::c_int, + P: *mut ::core::ffi::c_int, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseScsru2csr_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + csrVal: *mut f32, + csrRowPtr: *const ::core::ffi::c_int, + csrColInd: *mut ::core::ffi::c_int, + info: cuda_types::cusparse::csru2csrInfo_t, + pBufferSizeInBytes: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDcsru2csr_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + csrVal: *mut f64, + csrRowPtr: *const ::core::ffi::c_int, + csrColInd: *mut ::core::ffi::c_int, + info: cuda_types::cusparse::csru2csrInfo_t, + pBufferSizeInBytes: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCcsru2csr_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + csrVal: *mut cuda_types::cusparse::cuComplex, + csrRowPtr: *const ::core::ffi::c_int, + csrColInd: *mut ::core::ffi::c_int, + info: cuda_types::cusparse::csru2csrInfo_t, + pBufferSizeInBytes: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZcsru2csr_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + csrVal: *mut cuda_types::cusparse::cuDoubleComplex, + csrRowPtr: *const ::core::ffi::c_int, + csrColInd: *mut ::core::ffi::c_int, + info: cuda_types::cusparse::csru2csrInfo_t, + pBufferSizeInBytes: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseScsru2csr( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrVal: *mut f32, + csrRowPtr: *const ::core::ffi::c_int, + csrColInd: *mut ::core::ffi::c_int, + info: cuda_types::cusparse::csru2csrInfo_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDcsru2csr( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nnz: 
::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrVal: *mut f64, + csrRowPtr: *const ::core::ffi::c_int, + csrColInd: *mut ::core::ffi::c_int, + info: cuda_types::cusparse::csru2csrInfo_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCcsru2csr( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrVal: *mut cuda_types::cusparse::cuComplex, + csrRowPtr: *const ::core::ffi::c_int, + csrColInd: *mut ::core::ffi::c_int, + info: cuda_types::cusparse::csru2csrInfo_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZcsru2csr( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrVal: *mut cuda_types::cusparse::cuDoubleComplex, + csrRowPtr: *const ::core::ffi::c_int, + csrColInd: *mut ::core::ffi::c_int, + info: cuda_types::cusparse::csru2csrInfo_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseScsr2csru( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrVal: *mut f32, + csrRowPtr: *const ::core::ffi::c_int, + csrColInd: *mut ::core::ffi::c_int, + info: cuda_types::cusparse::csru2csrInfo_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDcsr2csru( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrVal: *mut f64, + csrRowPtr: *const ::core::ffi::c_int, + csrColInd: *mut ::core::ffi::c_int, + info: cuda_types::cusparse::csru2csrInfo_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCcsr2csru( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrVal: *mut cuda_types::cusparse::cuComplex, + csrRowPtr: *const ::core::ffi::c_int, + csrColInd: *mut ::core::ffi::c_int, + info: cuda_types::cusparse::csru2csrInfo_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseZcsr2csru( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrVal: *mut cuda_types::cusparse::cuDoubleComplex, + csrRowPtr: *const ::core::ffi::c_int, + csrColInd: *mut ::core::ffi::c_int, + info: cuda_types::cusparse::csru2csrInfo_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSpruneDense2csr_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + A: *const f32, + lda: ::core::ffi::c_int, + threshold: *const f32, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValC: *const f32, + csrSortedRowPtrC: *const ::core::ffi::c_int, + csrSortedColIndC: *const ::core::ffi::c_int, + pBufferSizeInBytes: *mut usize, + ) -> 
cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDpruneDense2csr_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + A: *const f64, + lda: ::core::ffi::c_int, + threshold: *const f64, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValC: *const f64, + csrSortedRowPtrC: *const ::core::ffi::c_int, + csrSortedColIndC: *const ::core::ffi::c_int, + pBufferSizeInBytes: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSpruneDense2csrNnz( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + A: *const f32, + lda: ::core::ffi::c_int, + threshold: *const f32, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + csrRowPtrC: *mut ::core::ffi::c_int, + nnzTotalDevHostPtr: *mut ::core::ffi::c_int, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDpruneDense2csrNnz( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + A: *const f64, + lda: ::core::ffi::c_int, + threshold: *const f64, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedRowPtrC: *mut ::core::ffi::c_int, + nnzTotalDevHostPtr: *mut ::core::ffi::c_int, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSpruneDense2csr( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + A: *const f32, + lda: ::core::ffi::c_int, + threshold: *const f32, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValC: *mut f32, + csrSortedRowPtrC: *const ::core::ffi::c_int, + csrSortedColIndC: *mut ::core::ffi::c_int, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDpruneDense2csr( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + A: *const f64, + lda: ::core::ffi::c_int, + threshold: *const f64, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValC: *mut f64, + csrSortedRowPtrC: *const ::core::ffi::c_int, + csrSortedColIndC: *mut ::core::ffi::c_int, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSpruneCsr2csr_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nnzA: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *const f32, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + threshold: *const f32, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValC: *const f32, + csrSortedRowPtrC: *const ::core::ffi::c_int, + csrSortedColIndC: *const ::core::ffi::c_int, + pBufferSizeInBytes: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDpruneCsr2csr_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nnzA: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *const f64, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + threshold: *const f64, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValC: *const f64, + csrSortedRowPtrC: *const ::core::ffi::c_int, + csrSortedColIndC: *const ::core::ffi::c_int, + 
pBufferSizeInBytes: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSpruneCsr2csrNnz( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nnzA: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *const f32, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + threshold: *const f32, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedRowPtrC: *mut ::core::ffi::c_int, + nnzTotalDevHostPtr: *mut ::core::ffi::c_int, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDpruneCsr2csrNnz( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nnzA: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *const f64, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + threshold: *const f64, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedRowPtrC: *mut ::core::ffi::c_int, + nnzTotalDevHostPtr: *mut ::core::ffi::c_int, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSpruneCsr2csr( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nnzA: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *const f32, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + threshold: *const f32, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValC: *mut f32, + csrSortedRowPtrC: *const ::core::ffi::c_int, + csrSortedColIndC: *mut ::core::ffi::c_int, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDpruneCsr2csr( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nnzA: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *const f64, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + threshold: *const f64, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValC: *mut f64, + csrSortedRowPtrC: *const ::core::ffi::c_int, + csrSortedColIndC: *mut ::core::ffi::c_int, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSpruneDense2csrByPercentage_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + A: *const f32, + lda: ::core::ffi::c_int, + percentage: f32, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValC: *const f32, + csrSortedRowPtrC: *const ::core::ffi::c_int, + csrSortedColIndC: *const ::core::ffi::c_int, + info: cuda_types::cusparse::pruneInfo_t, + pBufferSizeInBytes: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDpruneDense2csrByPercentage_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + A: *const f64, + lda: ::core::ffi::c_int, + percentage: f32, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValC: *const f64, + csrSortedRowPtrC: *const ::core::ffi::c_int, + csrSortedColIndC: *const ::core::ffi::c_int, + info: cuda_types::cusparse::pruneInfo_t, + pBufferSizeInBytes: *mut usize, + ) -> 
cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSpruneDense2csrNnzByPercentage( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + A: *const f32, + lda: ::core::ffi::c_int, + percentage: f32, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + csrRowPtrC: *mut ::core::ffi::c_int, + nnzTotalDevHostPtr: *mut ::core::ffi::c_int, + info: cuda_types::cusparse::pruneInfo_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDpruneDense2csrNnzByPercentage( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + A: *const f64, + lda: ::core::ffi::c_int, + percentage: f32, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + csrRowPtrC: *mut ::core::ffi::c_int, + nnzTotalDevHostPtr: *mut ::core::ffi::c_int, + info: cuda_types::cusparse::pruneInfo_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSpruneDense2csrByPercentage( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + A: *const f32, + lda: ::core::ffi::c_int, + percentage: f32, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValC: *mut f32, + csrSortedRowPtrC: *const ::core::ffi::c_int, + csrSortedColIndC: *mut ::core::ffi::c_int, + info: cuda_types::cusparse::pruneInfo_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDpruneDense2csrByPercentage( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + A: *const f64, + lda: ::core::ffi::c_int, + percentage: f32, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValC: *mut f64, + csrSortedRowPtrC: *const ::core::ffi::c_int, + csrSortedColIndC: *mut ::core::ffi::c_int, + info: cuda_types::cusparse::pruneInfo_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSpruneCsr2csrByPercentage_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nnzA: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *const f32, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + percentage: f32, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValC: *const f32, + csrSortedRowPtrC: *const ::core::ffi::c_int, + csrSortedColIndC: *const ::core::ffi::c_int, + info: cuda_types::cusparse::pruneInfo_t, + pBufferSizeInBytes: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDpruneCsr2csrByPercentage_bufferSizeExt( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nnzA: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *const f64, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + percentage: f32, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValC: *const f64, + csrSortedRowPtrC: *const ::core::ffi::c_int, + csrSortedColIndC: *const ::core::ffi::c_int, + info: cuda_types::cusparse::pruneInfo_t, + pBufferSizeInBytes: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSpruneCsr2csrNnzByPercentage( + handle: cuda_types::cusparse::cusparseHandle_t, + m: 
::core::ffi::c_int, + n: ::core::ffi::c_int, + nnzA: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *const f32, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + percentage: f32, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedRowPtrC: *mut ::core::ffi::c_int, + nnzTotalDevHostPtr: *mut ::core::ffi::c_int, + info: cuda_types::cusparse::pruneInfo_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDpruneCsr2csrNnzByPercentage( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nnzA: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *const f64, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + percentage: f32, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedRowPtrC: *mut ::core::ffi::c_int, + nnzTotalDevHostPtr: *mut ::core::ffi::c_int, + info: cuda_types::cusparse::pruneInfo_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSpruneCsr2csrByPercentage( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nnzA: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *const f32, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + percentage: f32, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValC: *mut f32, + csrSortedRowPtrC: *const ::core::ffi::c_int, + csrSortedColIndC: *mut ::core::ffi::c_int, + info: cuda_types::cusparse::pruneInfo_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDpruneCsr2csrByPercentage( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nnzA: ::core::ffi::c_int, + descrA: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValA: *const f64, + csrSortedRowPtrA: *const ::core::ffi::c_int, + csrSortedColIndA: *const ::core::ffi::c_int, + percentage: f32, + descrC: cuda_types::cusparse::cusparseMatDescr_t, + csrSortedValC: *mut f64, + csrSortedRowPtrC: *const ::core::ffi::c_int, + csrSortedColIndC: *mut ::core::ffi::c_int, + info: cuda_types::cusparse::pruneInfo_t, + pBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCsr2cscEx2( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + csrVal: *const ::core::ffi::c_void, + csrRowPtr: *const ::core::ffi::c_int, + csrColInd: *const ::core::ffi::c_int, + cscVal: *mut ::core::ffi::c_void, + cscColPtr: *mut ::core::ffi::c_int, + cscRowInd: *mut ::core::ffi::c_int, + valType: cuda_types::cusparse::cudaDataType, + copyValues: cuda_types::cusparse::cusparseAction_t, + idxBase: cuda_types::cusparse::cusparseIndexBase_t, + alg: cuda_types::cusparse::cusparseCsr2CscAlg_t, + buffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCsr2cscEx2_bufferSize( + handle: cuda_types::cusparse::cusparseHandle_t, + m: ::core::ffi::c_int, + n: ::core::ffi::c_int, + nnz: ::core::ffi::c_int, + csrVal: *const ::core::ffi::c_void, + csrRowPtr: *const ::core::ffi::c_int, + csrColInd: *const ::core::ffi::c_int, + cscVal: *mut ::core::ffi::c_void, + 
cscColPtr: *mut ::core::ffi::c_int, + cscRowInd: *mut ::core::ffi::c_int, + valType: cuda_types::cusparse::cudaDataType, + copyValues: cuda_types::cusparse::cusparseAction_t, + idxBase: cuda_types::cusparse::cusparseIndexBase_t, + alg: cuda_types::cusparse::cusparseCsr2CscAlg_t, + bufferSize: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCreateSpVec( + spVecDescr: *mut cuda_types::cusparse::cusparseSpVecDescr_t, + size: i64, + nnz: i64, + indices: *mut ::core::ffi::c_void, + values: *mut ::core::ffi::c_void, + idxType: cuda_types::cusparse::cusparseIndexType_t, + idxBase: cuda_types::cusparse::cusparseIndexBase_t, + valueType: cuda_types::cusparse::cudaDataType, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCreateConstSpVec( + spVecDescr: *mut cuda_types::cusparse::cusparseConstSpVecDescr_t, + size: i64, + nnz: i64, + indices: *const ::core::ffi::c_void, + values: *const ::core::ffi::c_void, + idxType: cuda_types::cusparse::cusparseIndexType_t, + idxBase: cuda_types::cusparse::cusparseIndexBase_t, + valueType: cuda_types::cusparse::cudaDataType, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDestroySpVec( + spVecDescr: cuda_types::cusparse::cusparseConstSpVecDescr_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSpVecGet( + spVecDescr: cuda_types::cusparse::cusparseSpVecDescr_t, + size: *mut i64, + nnz: *mut i64, + indices: *mut *mut ::core::ffi::c_void, + values: *mut *mut ::core::ffi::c_void, + idxType: *mut cuda_types::cusparse::cusparseIndexType_t, + idxBase: *mut cuda_types::cusparse::cusparseIndexBase_t, + valueType: *mut cuda_types::cusparse::cudaDataType, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseConstSpVecGet( + spVecDescr: cuda_types::cusparse::cusparseConstSpVecDescr_t, + size: *mut i64, + nnz: *mut i64, + indices: *mut *const ::core::ffi::c_void, + values: *mut *const ::core::ffi::c_void, + idxType: *mut cuda_types::cusparse::cusparseIndexType_t, + idxBase: *mut cuda_types::cusparse::cusparseIndexBase_t, + valueType: *mut cuda_types::cusparse::cudaDataType, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSpVecGetIndexBase( + spVecDescr: cuda_types::cusparse::cusparseConstSpVecDescr_t, + idxBase: *mut cuda_types::cusparse::cusparseIndexBase_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSpVecGetValues( + spVecDescr: cuda_types::cusparse::cusparseSpVecDescr_t, + values: *mut *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseConstSpVecGetValues( + spVecDescr: cuda_types::cusparse::cusparseConstSpVecDescr_t, + values: *mut *const ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSpVecSetValues( + spVecDescr: cuda_types::cusparse::cusparseSpVecDescr_t, + values: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCreateDnVec( + dnVecDescr: *mut cuda_types::cusparse::cusparseDnVecDescr_t, + size: i64, + values: *mut ::core::ffi::c_void, + valueType: cuda_types::cusparse::cudaDataType, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCreateConstDnVec( + dnVecDescr: *mut cuda_types::cusparse::cusparseConstDnVecDescr_t, + size: i64, + values: *const ::core::ffi::c_void, + valueType: cuda_types::cusparse::cudaDataType, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDestroyDnVec( + 
dnVecDescr: cuda_types::cusparse::cusparseConstDnVecDescr_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDnVecGet( + dnVecDescr: cuda_types::cusparse::cusparseDnVecDescr_t, + size: *mut i64, + values: *mut *mut ::core::ffi::c_void, + valueType: *mut cuda_types::cusparse::cudaDataType, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseConstDnVecGet( + dnVecDescr: cuda_types::cusparse::cusparseConstDnVecDescr_t, + size: *mut i64, + values: *mut *const ::core::ffi::c_void, + valueType: *mut cuda_types::cusparse::cudaDataType, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDnVecGetValues( + dnVecDescr: cuda_types::cusparse::cusparseDnVecDescr_t, + values: *mut *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseConstDnVecGetValues( + dnVecDescr: cuda_types::cusparse::cusparseConstDnVecDescr_t, + values: *mut *const ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDnVecSetValues( + dnVecDescr: cuda_types::cusparse::cusparseDnVecDescr_t, + values: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDestroySpMat( + spMatDescr: cuda_types::cusparse::cusparseConstSpMatDescr_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSpMatGetFormat( + spMatDescr: cuda_types::cusparse::cusparseConstSpMatDescr_t, + format: *mut cuda_types::cusparse::cusparseFormat_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSpMatGetIndexBase( + spMatDescr: cuda_types::cusparse::cusparseConstSpMatDescr_t, + idxBase: *mut cuda_types::cusparse::cusparseIndexBase_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSpMatGetValues( + spMatDescr: cuda_types::cusparse::cusparseSpMatDescr_t, + values: *mut *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseConstSpMatGetValues( + spMatDescr: cuda_types::cusparse::cusparseConstSpMatDescr_t, + values: *mut *const ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSpMatSetValues( + spMatDescr: cuda_types::cusparse::cusparseSpMatDescr_t, + values: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSpMatGetSize( + spMatDescr: cuda_types::cusparse::cusparseConstSpMatDescr_t, + rows: *mut i64, + cols: *mut i64, + nnz: *mut i64, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSpMatGetStridedBatch( + spMatDescr: cuda_types::cusparse::cusparseConstSpMatDescr_t, + batchCount: *mut ::core::ffi::c_int, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCooSetStridedBatch( + spMatDescr: cuda_types::cusparse::cusparseSpMatDescr_t, + batchCount: ::core::ffi::c_int, + batchStride: i64, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCsrSetStridedBatch( + spMatDescr: cuda_types::cusparse::cusparseSpMatDescr_t, + batchCount: ::core::ffi::c_int, + offsetsBatchStride: i64, + columnsValuesBatchStride: i64, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseBsrSetStridedBatch( + spMatDescr: cuda_types::cusparse::cusparseSpMatDescr_t, + batchCount: ::core::ffi::c_int, + offsetsBatchStride: i64, + columnsBatchStride: i64, + ValuesBatchStride: i64, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSpMatGetAttribute( + spMatDescr: 
cuda_types::cusparse::cusparseConstSpMatDescr_t, + attribute: cuda_types::cusparse::cusparseSpMatAttribute_t, + data: *mut ::core::ffi::c_void, + dataSize: usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSpMatSetAttribute( + spMatDescr: cuda_types::cusparse::cusparseSpMatDescr_t, + attribute: cuda_types::cusparse::cusparseSpMatAttribute_t, + data: *mut ::core::ffi::c_void, + dataSize: usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCreateCsr( + spMatDescr: *mut cuda_types::cusparse::cusparseSpMatDescr_t, + rows: i64, + cols: i64, + nnz: i64, + csrRowOffsets: *mut ::core::ffi::c_void, + csrColInd: *mut ::core::ffi::c_void, + csrValues: *mut ::core::ffi::c_void, + csrRowOffsetsType: cuda_types::cusparse::cusparseIndexType_t, + csrColIndType: cuda_types::cusparse::cusparseIndexType_t, + idxBase: cuda_types::cusparse::cusparseIndexBase_t, + valueType: cuda_types::cusparse::cudaDataType, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCreateConstCsr( + spMatDescr: *mut cuda_types::cusparse::cusparseConstSpMatDescr_t, + rows: i64, + cols: i64, + nnz: i64, + csrRowOffsets: *const ::core::ffi::c_void, + csrColInd: *const ::core::ffi::c_void, + csrValues: *const ::core::ffi::c_void, + csrRowOffsetsType: cuda_types::cusparse::cusparseIndexType_t, + csrColIndType: cuda_types::cusparse::cusparseIndexType_t, + idxBase: cuda_types::cusparse::cusparseIndexBase_t, + valueType: cuda_types::cusparse::cudaDataType, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCreateCsc( + spMatDescr: *mut cuda_types::cusparse::cusparseSpMatDescr_t, + rows: i64, + cols: i64, + nnz: i64, + cscColOffsets: *mut ::core::ffi::c_void, + cscRowInd: *mut ::core::ffi::c_void, + cscValues: *mut ::core::ffi::c_void, + cscColOffsetsType: cuda_types::cusparse::cusparseIndexType_t, + cscRowIndType: cuda_types::cusparse::cusparseIndexType_t, + idxBase: cuda_types::cusparse::cusparseIndexBase_t, + valueType: cuda_types::cusparse::cudaDataType, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCreateConstCsc( + spMatDescr: *mut cuda_types::cusparse::cusparseConstSpMatDescr_t, + rows: i64, + cols: i64, + nnz: i64, + cscColOffsets: *const ::core::ffi::c_void, + cscRowInd: *const ::core::ffi::c_void, + cscValues: *const ::core::ffi::c_void, + cscColOffsetsType: cuda_types::cusparse::cusparseIndexType_t, + cscRowIndType: cuda_types::cusparse::cusparseIndexType_t, + idxBase: cuda_types::cusparse::cusparseIndexBase_t, + valueType: cuda_types::cusparse::cudaDataType, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCsrGet( + spMatDescr: cuda_types::cusparse::cusparseSpMatDescr_t, + rows: *mut i64, + cols: *mut i64, + nnz: *mut i64, + csrRowOffsets: *mut *mut ::core::ffi::c_void, + csrColInd: *mut *mut ::core::ffi::c_void, + csrValues: *mut *mut ::core::ffi::c_void, + csrRowOffsetsType: *mut cuda_types::cusparse::cusparseIndexType_t, + csrColIndType: *mut cuda_types::cusparse::cusparseIndexType_t, + idxBase: *mut cuda_types::cusparse::cusparseIndexBase_t, + valueType: *mut cuda_types::cusparse::cudaDataType, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseConstCsrGet( + spMatDescr: cuda_types::cusparse::cusparseConstSpMatDescr_t, + rows: *mut i64, + cols: *mut i64, + nnz: *mut i64, + csrRowOffsets: *mut *const ::core::ffi::c_void, + csrColInd: *mut *const ::core::ffi::c_void, + csrValues: *mut *const ::core::ffi::c_void, + csrRowOffsetsType: *mut 
cuda_types::cusparse::cusparseIndexType_t, + csrColIndType: *mut cuda_types::cusparse::cusparseIndexType_t, + idxBase: *mut cuda_types::cusparse::cusparseIndexBase_t, + valueType: *mut cuda_types::cusparse::cudaDataType, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCscGet( + spMatDescr: cuda_types::cusparse::cusparseSpMatDescr_t, + rows: *mut i64, + cols: *mut i64, + nnz: *mut i64, + cscColOffsets: *mut *mut ::core::ffi::c_void, + cscRowInd: *mut *mut ::core::ffi::c_void, + cscValues: *mut *mut ::core::ffi::c_void, + cscColOffsetsType: *mut cuda_types::cusparse::cusparseIndexType_t, + cscRowIndType: *mut cuda_types::cusparse::cusparseIndexType_t, + idxBase: *mut cuda_types::cusparse::cusparseIndexBase_t, + valueType: *mut cuda_types::cusparse::cudaDataType, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseConstCscGet( + spMatDescr: cuda_types::cusparse::cusparseConstSpMatDescr_t, + rows: *mut i64, + cols: *mut i64, + nnz: *mut i64, + cscColOffsets: *mut *const ::core::ffi::c_void, + cscRowInd: *mut *const ::core::ffi::c_void, + cscValues: *mut *const ::core::ffi::c_void, + cscColOffsetsType: *mut cuda_types::cusparse::cusparseIndexType_t, + cscRowIndType: *mut cuda_types::cusparse::cusparseIndexType_t, + idxBase: *mut cuda_types::cusparse::cusparseIndexBase_t, + valueType: *mut cuda_types::cusparse::cudaDataType, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCsrSetPointers( + spMatDescr: cuda_types::cusparse::cusparseSpMatDescr_t, + csrRowOffsets: *mut ::core::ffi::c_void, + csrColInd: *mut ::core::ffi::c_void, + csrValues: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCscSetPointers( + spMatDescr: cuda_types::cusparse::cusparseSpMatDescr_t, + cscColOffsets: *mut ::core::ffi::c_void, + cscRowInd: *mut ::core::ffi::c_void, + cscValues: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCreateBsr( + spMatDescr: *mut cuda_types::cusparse::cusparseSpMatDescr_t, + brows: i64, + bcols: i64, + bnnz: i64, + rowBlockSize: i64, + colBlockSize: i64, + bsrRowOffsets: *mut ::core::ffi::c_void, + bsrColInd: *mut ::core::ffi::c_void, + bsrValues: *mut ::core::ffi::c_void, + bsrRowOffsetsType: cuda_types::cusparse::cusparseIndexType_t, + bsrColIndType: cuda_types::cusparse::cusparseIndexType_t, + idxBase: cuda_types::cusparse::cusparseIndexBase_t, + valueType: cuda_types::cusparse::cudaDataType, + order: cuda_types::cusparse::cusparseOrder_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCreateConstBsr( + spMatDescr: *mut cuda_types::cusparse::cusparseConstSpMatDescr_t, + brows: i64, + bcols: i64, + bnnz: i64, + rowBlockDim: i64, + colBlockDim: i64, + bsrRowOffsets: *const ::core::ffi::c_void, + bsrColInd: *const ::core::ffi::c_void, + bsrValues: *const ::core::ffi::c_void, + bsrRowOffsetsType: cuda_types::cusparse::cusparseIndexType_t, + bsrColIndType: cuda_types::cusparse::cusparseIndexType_t, + idxBase: cuda_types::cusparse::cusparseIndexBase_t, + valueType: cuda_types::cusparse::cudaDataType, + order: cuda_types::cusparse::cusparseOrder_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCreateCoo( + spMatDescr: *mut cuda_types::cusparse::cusparseSpMatDescr_t, + rows: i64, + cols: i64, + nnz: i64, + cooRowInd: *mut ::core::ffi::c_void, + cooColInd: *mut ::core::ffi::c_void, + cooValues: *mut ::core::ffi::c_void, + cooIdxType: cuda_types::cusparse::cusparseIndexType_t, 
+ idxBase: cuda_types::cusparse::cusparseIndexBase_t, + valueType: cuda_types::cusparse::cudaDataType, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCreateConstCoo( + spMatDescr: *mut cuda_types::cusparse::cusparseConstSpMatDescr_t, + rows: i64, + cols: i64, + nnz: i64, + cooRowInd: *const ::core::ffi::c_void, + cooColInd: *const ::core::ffi::c_void, + cooValues: *const ::core::ffi::c_void, + cooIdxType: cuda_types::cusparse::cusparseIndexType_t, + idxBase: cuda_types::cusparse::cusparseIndexBase_t, + valueType: cuda_types::cusparse::cudaDataType, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCooGet( + spMatDescr: cuda_types::cusparse::cusparseSpMatDescr_t, + rows: *mut i64, + cols: *mut i64, + nnz: *mut i64, + cooRowInd: *mut *mut ::core::ffi::c_void, + cooColInd: *mut *mut ::core::ffi::c_void, + cooValues: *mut *mut ::core::ffi::c_void, + idxType: *mut cuda_types::cusparse::cusparseIndexType_t, + idxBase: *mut cuda_types::cusparse::cusparseIndexBase_t, + valueType: *mut cuda_types::cusparse::cudaDataType, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseConstCooGet( + spMatDescr: cuda_types::cusparse::cusparseConstSpMatDescr_t, + rows: *mut i64, + cols: *mut i64, + nnz: *mut i64, + cooRowInd: *mut *const ::core::ffi::c_void, + cooColInd: *mut *const ::core::ffi::c_void, + cooValues: *mut *const ::core::ffi::c_void, + idxType: *mut cuda_types::cusparse::cusparseIndexType_t, + idxBase: *mut cuda_types::cusparse::cusparseIndexBase_t, + valueType: *mut cuda_types::cusparse::cudaDataType, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCooSetPointers( + spMatDescr: cuda_types::cusparse::cusparseSpMatDescr_t, + cooRows: *mut ::core::ffi::c_void, + cooColumns: *mut ::core::ffi::c_void, + cooValues: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCreateBlockedEll( + spMatDescr: *mut cuda_types::cusparse::cusparseSpMatDescr_t, + rows: i64, + cols: i64, + ellBlockSize: i64, + ellCols: i64, + ellColInd: *mut ::core::ffi::c_void, + ellValue: *mut ::core::ffi::c_void, + ellIdxType: cuda_types::cusparse::cusparseIndexType_t, + idxBase: cuda_types::cusparse::cusparseIndexBase_t, + valueType: cuda_types::cusparse::cudaDataType, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCreateConstBlockedEll( + spMatDescr: *mut cuda_types::cusparse::cusparseConstSpMatDescr_t, + rows: i64, + cols: i64, + ellBlockSize: i64, + ellCols: i64, + ellColInd: *const ::core::ffi::c_void, + ellValue: *const ::core::ffi::c_void, + ellIdxType: cuda_types::cusparse::cusparseIndexType_t, + idxBase: cuda_types::cusparse::cusparseIndexBase_t, + valueType: cuda_types::cusparse::cudaDataType, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseBlockedEllGet( + spMatDescr: cuda_types::cusparse::cusparseSpMatDescr_t, + rows: *mut i64, + cols: *mut i64, + ellBlockSize: *mut i64, + ellCols: *mut i64, + ellColInd: *mut *mut ::core::ffi::c_void, + ellValue: *mut *mut ::core::ffi::c_void, + ellIdxType: *mut cuda_types::cusparse::cusparseIndexType_t, + idxBase: *mut cuda_types::cusparse::cusparseIndexBase_t, + valueType: *mut cuda_types::cusparse::cudaDataType, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseConstBlockedEllGet( + spMatDescr: cuda_types::cusparse::cusparseConstSpMatDescr_t, + rows: *mut i64, + cols: *mut i64, + ellBlockSize: *mut i64, + ellCols: *mut i64, + ellColInd: *mut *const 
::core::ffi::c_void, + ellValue: *mut *const ::core::ffi::c_void, + ellIdxType: *mut cuda_types::cusparse::cusparseIndexType_t, + idxBase: *mut cuda_types::cusparse::cusparseIndexBase_t, + valueType: *mut cuda_types::cusparse::cudaDataType, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCreateSlicedEll( + spMatDescr: *mut cuda_types::cusparse::cusparseSpMatDescr_t, + rows: i64, + cols: i64, + nnz: i64, + sellValuesSize: i64, + sliceSize: i64, + sellSliceOffsets: *mut ::core::ffi::c_void, + sellColInd: *mut ::core::ffi::c_void, + sellValues: *mut ::core::ffi::c_void, + sellSliceOffsetsType: cuda_types::cusparse::cusparseIndexType_t, + sellColIndType: cuda_types::cusparse::cusparseIndexType_t, + idxBase: cuda_types::cusparse::cusparseIndexBase_t, + valueType: cuda_types::cusparse::cudaDataType, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCreateConstSlicedEll( + spMatDescr: *mut cuda_types::cusparse::cusparseConstSpMatDescr_t, + rows: i64, + cols: i64, + nnz: i64, + sellValuesSize: i64, + sliceSize: i64, + sellSliceOffsets: *const ::core::ffi::c_void, + sellColInd: *const ::core::ffi::c_void, + sellValues: *const ::core::ffi::c_void, + sellSliceOffsetsType: cuda_types::cusparse::cusparseIndexType_t, + sellColIndType: cuda_types::cusparse::cusparseIndexType_t, + idxBase: cuda_types::cusparse::cusparseIndexBase_t, + valueType: cuda_types::cusparse::cudaDataType, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCreateDnMat( + dnMatDescr: *mut cuda_types::cusparse::cusparseDnMatDescr_t, + rows: i64, + cols: i64, + ld: i64, + values: *mut ::core::ffi::c_void, + valueType: cuda_types::cusparse::cudaDataType, + order: cuda_types::cusparse::cusparseOrder_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseCreateConstDnMat( + dnMatDescr: *mut cuda_types::cusparse::cusparseConstDnMatDescr_t, + rows: i64, + cols: i64, + ld: i64, + values: *const ::core::ffi::c_void, + valueType: cuda_types::cusparse::cudaDataType, + order: cuda_types::cusparse::cusparseOrder_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDestroyDnMat( + dnMatDescr: cuda_types::cusparse::cusparseConstDnMatDescr_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDnMatGet( + dnMatDescr: cuda_types::cusparse::cusparseDnMatDescr_t, + rows: *mut i64, + cols: *mut i64, + ld: *mut i64, + values: *mut *mut ::core::ffi::c_void, + type_: *mut cuda_types::cusparse::cudaDataType, + order: *mut cuda_types::cusparse::cusparseOrder_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseConstDnMatGet( + dnMatDescr: cuda_types::cusparse::cusparseConstDnMatDescr_t, + rows: *mut i64, + cols: *mut i64, + ld: *mut i64, + values: *mut *const ::core::ffi::c_void, + type_: *mut cuda_types::cusparse::cudaDataType, + order: *mut cuda_types::cusparse::cusparseOrder_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDnMatGetValues( + dnMatDescr: cuda_types::cusparse::cusparseDnMatDescr_t, + values: *mut *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseConstDnMatGetValues( + dnMatDescr: cuda_types::cusparse::cusparseConstDnMatDescr_t, + values: *mut *const ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDnMatSetValues( + dnMatDescr: cuda_types::cusparse::cusparseDnMatDescr_t, + values: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + 
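
The dense-matrix descriptor functions above (cusparseCreateDnMat, cusparseDnMatGet, cusparseDestroyDnMat) follow the usual cuSPARSE create/query/destroy lifecycle. A minimal sketch of driving them through these bindings follows; it assumes the declarations are surfaced as callable extern "C" items, that the generated enum types derive PartialEq as bindgen output typically does, and it uses the standard CUDA constant names (CUDA_R_32F, CUSPARSE_ORDER_COL, CUSPARSE_STATUS_SUCCESS), none of which are spelled out in this hunk:

    // Illustrative only, not part of this patch: wrap an existing device
    // buffer of f32 values in a dense-matrix descriptor, query its shape
    // back, then destroy the descriptor.
    use cuda_types::cusparse::*;

    unsafe fn dnmat_roundtrip(
        device_values: *mut ::core::ffi::c_void, // rows * cols f32 values on the device
        rows: i64,
        cols: i64,
    ) -> Result<(i64, i64), cusparseStatus_t> {
        let mut mat: cusparseDnMatDescr_t = ::core::ptr::null_mut();
        let status = cusparseCreateDnMat(
            &mut mat,
            rows,
            cols,
            rows, // ld: leading dimension; equals rows for packed column-major data
            device_values,
            cudaDataType::CUDA_R_32F,
            cusparseOrder_t::CUSPARSE_ORDER_COL,
        );
        if status != cusparseStatus_t::CUSPARSE_STATUS_SUCCESS {
            return Err(status);
        }
        // Query the descriptor back; every out-parameter is written on success.
        let (mut r, mut c, mut ld) = (0i64, 0i64, 0i64);
        let mut values: *mut ::core::ffi::c_void = ::core::ptr::null_mut();
        let mut ty = cudaDataType::CUDA_R_32F;
        let mut order = cusparseOrder_t::CUSPARSE_ORDER_COL;
        let status = cusparseDnMatGet(
            mat, &mut r, &mut c, &mut ld, &mut values, &mut ty, &mut order,
        );
        // cusparseDestroyDnMat takes the const descriptor alias; a pointer
        // cast bridges the two typedefs. The descriptor is freed either way.
        let _ = cusparseDestroyDnMat(mat as _);
        if status != cusparseStatus_t::CUSPARSE_STATUS_SUCCESS {
            return Err(status);
        }
        Ok((r, c))
    }

The same create/get/destroy pattern applies to the SpVec, DnVec, and SpMat descriptor entry points declared earlier in this file; only the create/get signatures differ per format.
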
#[must_use] + fn cusparseDnMatSetStridedBatch( + dnMatDescr: cuda_types::cusparse::cusparseDnMatDescr_t, + batchCount: ::core::ffi::c_int, + batchStride: i64, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDnMatGetStridedBatch( + dnMatDescr: cuda_types::cusparse::cusparseConstDnMatDescr_t, + batchCount: *mut ::core::ffi::c_int, + batchStride: *mut i64, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseAxpby( + handle: cuda_types::cusparse::cusparseHandle_t, + alpha: *const ::core::ffi::c_void, + vecX: cuda_types::cusparse::cusparseConstSpVecDescr_t, + beta: *const ::core::ffi::c_void, + vecY: cuda_types::cusparse::cusparseDnVecDescr_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseGather( + handle: cuda_types::cusparse::cusparseHandle_t, + vecY: cuda_types::cusparse::cusparseConstDnVecDescr_t, + vecX: cuda_types::cusparse::cusparseSpVecDescr_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseScatter( + handle: cuda_types::cusparse::cusparseHandle_t, + vecX: cuda_types::cusparse::cusparseConstSpVecDescr_t, + vecY: cuda_types::cusparse::cusparseDnVecDescr_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseRot( + handle: cuda_types::cusparse::cusparseHandle_t, + c_coeff: *const ::core::ffi::c_void, + s_coeff: *const ::core::ffi::c_void, + vecX: cuda_types::cusparse::cusparseSpVecDescr_t, + vecY: cuda_types::cusparse::cusparseDnVecDescr_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSpVV_bufferSize( + handle: cuda_types::cusparse::cusparseHandle_t, + opX: cuda_types::cusparse::cusparseOperation_t, + vecX: cuda_types::cusparse::cusparseConstSpVecDescr_t, + vecY: cuda_types::cusparse::cusparseConstDnVecDescr_t, + result: *const ::core::ffi::c_void, + computeType: cuda_types::cusparse::cudaDataType, + bufferSize: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSpVV( + handle: cuda_types::cusparse::cusparseHandle_t, + opX: cuda_types::cusparse::cusparseOperation_t, + vecX: cuda_types::cusparse::cusparseConstSpVecDescr_t, + vecY: cuda_types::cusparse::cusparseConstDnVecDescr_t, + result: *mut ::core::ffi::c_void, + computeType: cuda_types::cusparse::cudaDataType, + externalBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSparseToDense_bufferSize( + handle: cuda_types::cusparse::cusparseHandle_t, + matA: cuda_types::cusparse::cusparseConstSpMatDescr_t, + matB: cuda_types::cusparse::cusparseDnMatDescr_t, + alg: cuda_types::cusparse::cusparseSparseToDenseAlg_t, + bufferSize: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSparseToDense( + handle: cuda_types::cusparse::cusparseHandle_t, + matA: cuda_types::cusparse::cusparseConstSpMatDescr_t, + matB: cuda_types::cusparse::cusparseDnMatDescr_t, + alg: cuda_types::cusparse::cusparseSparseToDenseAlg_t, + externalBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDenseToSparse_bufferSize( + handle: cuda_types::cusparse::cusparseHandle_t, + matA: cuda_types::cusparse::cusparseConstDnMatDescr_t, + matB: cuda_types::cusparse::cusparseSpMatDescr_t, + alg: cuda_types::cusparse::cusparseDenseToSparseAlg_t, + bufferSize: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDenseToSparse_analysis( + handle: cuda_types::cusparse::cusparseHandle_t, + matA: 
cuda_types::cusparse::cusparseConstDnMatDescr_t, + matB: cuda_types::cusparse::cusparseSpMatDescr_t, + alg: cuda_types::cusparse::cusparseDenseToSparseAlg_t, + externalBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseDenseToSparse_convert( + handle: cuda_types::cusparse::cusparseHandle_t, + matA: cuda_types::cusparse::cusparseConstDnMatDescr_t, + matB: cuda_types::cusparse::cusparseSpMatDescr_t, + alg: cuda_types::cusparse::cusparseDenseToSparseAlg_t, + externalBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSpMV( + handle: cuda_types::cusparse::cusparseHandle_t, + opA: cuda_types::cusparse::cusparseOperation_t, + alpha: *const ::core::ffi::c_void, + matA: cuda_types::cusparse::cusparseConstSpMatDescr_t, + vecX: cuda_types::cusparse::cusparseConstDnVecDescr_t, + beta: *const ::core::ffi::c_void, + vecY: cuda_types::cusparse::cusparseDnVecDescr_t, + computeType: cuda_types::cusparse::cudaDataType, + alg: cuda_types::cusparse::cusparseSpMVAlg_t, + externalBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSpMV_bufferSize( + handle: cuda_types::cusparse::cusparseHandle_t, + opA: cuda_types::cusparse::cusparseOperation_t, + alpha: *const ::core::ffi::c_void, + matA: cuda_types::cusparse::cusparseConstSpMatDescr_t, + vecX: cuda_types::cusparse::cusparseConstDnVecDescr_t, + beta: *const ::core::ffi::c_void, + vecY: cuda_types::cusparse::cusparseDnVecDescr_t, + computeType: cuda_types::cusparse::cudaDataType, + alg: cuda_types::cusparse::cusparseSpMVAlg_t, + bufferSize: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSpMV_preprocess( + handle: cuda_types::cusparse::cusparseHandle_t, + opA: cuda_types::cusparse::cusparseOperation_t, + alpha: *const ::core::ffi::c_void, + matA: cuda_types::cusparse::cusparseConstSpMatDescr_t, + vecX: cuda_types::cusparse::cusparseConstDnVecDescr_t, + beta: *const ::core::ffi::c_void, + vecY: cuda_types::cusparse::cusparseDnVecDescr_t, + computeType: cuda_types::cusparse::cudaDataType, + alg: cuda_types::cusparse::cusparseSpMVAlg_t, + externalBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSpSV_createDescr( + descr: *mut cuda_types::cusparse::cusparseSpSVDescr_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSpSV_destroyDescr( + descr: cuda_types::cusparse::cusparseSpSVDescr_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSpSV_bufferSize( + handle: cuda_types::cusparse::cusparseHandle_t, + opA: cuda_types::cusparse::cusparseOperation_t, + alpha: *const ::core::ffi::c_void, + matA: cuda_types::cusparse::cusparseConstSpMatDescr_t, + vecX: cuda_types::cusparse::cusparseConstDnVecDescr_t, + vecY: cuda_types::cusparse::cusparseDnVecDescr_t, + computeType: cuda_types::cusparse::cudaDataType, + alg: cuda_types::cusparse::cusparseSpSVAlg_t, + spsvDescr: cuda_types::cusparse::cusparseSpSVDescr_t, + bufferSize: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSpSV_analysis( + handle: cuda_types::cusparse::cusparseHandle_t, + opA: cuda_types::cusparse::cusparseOperation_t, + alpha: *const ::core::ffi::c_void, + matA: cuda_types::cusparse::cusparseConstSpMatDescr_t, + vecX: cuda_types::cusparse::cusparseConstDnVecDescr_t, + vecY: cuda_types::cusparse::cusparseDnVecDescr_t, + computeType: 
cuda_types::cusparse::cudaDataType, + alg: cuda_types::cusparse::cusparseSpSVAlg_t, + spsvDescr: cuda_types::cusparse::cusparseSpSVDescr_t, + externalBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSpSV_solve( + handle: cuda_types::cusparse::cusparseHandle_t, + opA: cuda_types::cusparse::cusparseOperation_t, + alpha: *const ::core::ffi::c_void, + matA: cuda_types::cusparse::cusparseConstSpMatDescr_t, + vecX: cuda_types::cusparse::cusparseConstDnVecDescr_t, + vecY: cuda_types::cusparse::cusparseDnVecDescr_t, + computeType: cuda_types::cusparse::cudaDataType, + alg: cuda_types::cusparse::cusparseSpSVAlg_t, + spsvDescr: cuda_types::cusparse::cusparseSpSVDescr_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSpSV_updateMatrix( + handle: cuda_types::cusparse::cusparseHandle_t, + spsvDescr: cuda_types::cusparse::cusparseSpSVDescr_t, + newValues: *mut ::core::ffi::c_void, + updatePart: cuda_types::cusparse::cusparseSpSVUpdate_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSpSM_createDescr( + descr: *mut cuda_types::cusparse::cusparseSpSMDescr_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSpSM_destroyDescr( + descr: cuda_types::cusparse::cusparseSpSMDescr_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSpSM_bufferSize( + handle: cuda_types::cusparse::cusparseHandle_t, + opA: cuda_types::cusparse::cusparseOperation_t, + opB: cuda_types::cusparse::cusparseOperation_t, + alpha: *const ::core::ffi::c_void, + matA: cuda_types::cusparse::cusparseConstSpMatDescr_t, + matB: cuda_types::cusparse::cusparseConstDnMatDescr_t, + matC: cuda_types::cusparse::cusparseDnMatDescr_t, + computeType: cuda_types::cusparse::cudaDataType, + alg: cuda_types::cusparse::cusparseSpSMAlg_t, + spsmDescr: cuda_types::cusparse::cusparseSpSMDescr_t, + bufferSize: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSpSM_analysis( + handle: cuda_types::cusparse::cusparseHandle_t, + opA: cuda_types::cusparse::cusparseOperation_t, + opB: cuda_types::cusparse::cusparseOperation_t, + alpha: *const ::core::ffi::c_void, + matA: cuda_types::cusparse::cusparseConstSpMatDescr_t, + matB: cuda_types::cusparse::cusparseConstDnMatDescr_t, + matC: cuda_types::cusparse::cusparseDnMatDescr_t, + computeType: cuda_types::cusparse::cudaDataType, + alg: cuda_types::cusparse::cusparseSpSMAlg_t, + spsmDescr: cuda_types::cusparse::cusparseSpSMDescr_t, + externalBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSpSM_solve( + handle: cuda_types::cusparse::cusparseHandle_t, + opA: cuda_types::cusparse::cusparseOperation_t, + opB: cuda_types::cusparse::cusparseOperation_t, + alpha: *const ::core::ffi::c_void, + matA: cuda_types::cusparse::cusparseConstSpMatDescr_t, + matB: cuda_types::cusparse::cusparseConstDnMatDescr_t, + matC: cuda_types::cusparse::cusparseDnMatDescr_t, + computeType: cuda_types::cusparse::cudaDataType, + alg: cuda_types::cusparse::cusparseSpSMAlg_t, + spsmDescr: cuda_types::cusparse::cusparseSpSMDescr_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSpSM_updateMatrix( + handle: cuda_types::cusparse::cusparseHandle_t, + spsmDescr: cuda_types::cusparse::cusparseSpSMDescr_t, + newValues: *mut ::core::ffi::c_void, + updatePart: cuda_types::cusparse::cusparseSpSMUpdate_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn 
cusparseSpMM_bufferSize( + handle: cuda_types::cusparse::cusparseHandle_t, + opA: cuda_types::cusparse::cusparseOperation_t, + opB: cuda_types::cusparse::cusparseOperation_t, + alpha: *const ::core::ffi::c_void, + matA: cuda_types::cusparse::cusparseConstSpMatDescr_t, + matB: cuda_types::cusparse::cusparseConstDnMatDescr_t, + beta: *const ::core::ffi::c_void, + matC: cuda_types::cusparse::cusparseDnMatDescr_t, + computeType: cuda_types::cusparse::cudaDataType, + alg: cuda_types::cusparse::cusparseSpMMAlg_t, + bufferSize: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSpMM_preprocess( + handle: cuda_types::cusparse::cusparseHandle_t, + opA: cuda_types::cusparse::cusparseOperation_t, + opB: cuda_types::cusparse::cusparseOperation_t, + alpha: *const ::core::ffi::c_void, + matA: cuda_types::cusparse::cusparseConstSpMatDescr_t, + matB: cuda_types::cusparse::cusparseConstDnMatDescr_t, + beta: *const ::core::ffi::c_void, + matC: cuda_types::cusparse::cusparseDnMatDescr_t, + computeType: cuda_types::cusparse::cudaDataType, + alg: cuda_types::cusparse::cusparseSpMMAlg_t, + externalBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSpMM( + handle: cuda_types::cusparse::cusparseHandle_t, + opA: cuda_types::cusparse::cusparseOperation_t, + opB: cuda_types::cusparse::cusparseOperation_t, + alpha: *const ::core::ffi::c_void, + matA: cuda_types::cusparse::cusparseConstSpMatDescr_t, + matB: cuda_types::cusparse::cusparseConstDnMatDescr_t, + beta: *const ::core::ffi::c_void, + matC: cuda_types::cusparse::cusparseDnMatDescr_t, + computeType: cuda_types::cusparse::cudaDataType, + alg: cuda_types::cusparse::cusparseSpMMAlg_t, + externalBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSpGEMM_createDescr( + descr: *mut cuda_types::cusparse::cusparseSpGEMMDescr_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSpGEMM_destroyDescr( + descr: cuda_types::cusparse::cusparseSpGEMMDescr_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSpGEMM_workEstimation( + handle: cuda_types::cusparse::cusparseHandle_t, + opA: cuda_types::cusparse::cusparseOperation_t, + opB: cuda_types::cusparse::cusparseOperation_t, + alpha: *const ::core::ffi::c_void, + matA: cuda_types::cusparse::cusparseConstSpMatDescr_t, + matB: cuda_types::cusparse::cusparseConstSpMatDescr_t, + beta: *const ::core::ffi::c_void, + matC: cuda_types::cusparse::cusparseSpMatDescr_t, + computeType: cuda_types::cusparse::cudaDataType, + alg: cuda_types::cusparse::cusparseSpGEMMAlg_t, + spgemmDescr: cuda_types::cusparse::cusparseSpGEMMDescr_t, + bufferSize1: *mut usize, + externalBuffer1: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSpGEMM_getNumProducts( + spgemmDescr: cuda_types::cusparse::cusparseSpGEMMDescr_t, + num_prods: *mut i64, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSpGEMM_estimateMemory( + handle: cuda_types::cusparse::cusparseHandle_t, + opA: cuda_types::cusparse::cusparseOperation_t, + opB: cuda_types::cusparse::cusparseOperation_t, + alpha: *const ::core::ffi::c_void, + matA: cuda_types::cusparse::cusparseConstSpMatDescr_t, + matB: cuda_types::cusparse::cusparseConstSpMatDescr_t, + beta: *const ::core::ffi::c_void, + matC: cuda_types::cusparse::cusparseSpMatDescr_t, + computeType: cuda_types::cusparse::cudaDataType, + alg: 
cuda_types::cusparse::cusparseSpGEMMAlg_t, + spgemmDescr: cuda_types::cusparse::cusparseSpGEMMDescr_t, + chunk_fraction: f32, + bufferSize3: *mut usize, + externalBuffer3: *mut ::core::ffi::c_void, + bufferSize2: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSpGEMM_compute( + handle: cuda_types::cusparse::cusparseHandle_t, + opA: cuda_types::cusparse::cusparseOperation_t, + opB: cuda_types::cusparse::cusparseOperation_t, + alpha: *const ::core::ffi::c_void, + matA: cuda_types::cusparse::cusparseConstSpMatDescr_t, + matB: cuda_types::cusparse::cusparseConstSpMatDescr_t, + beta: *const ::core::ffi::c_void, + matC: cuda_types::cusparse::cusparseSpMatDescr_t, + computeType: cuda_types::cusparse::cudaDataType, + alg: cuda_types::cusparse::cusparseSpGEMMAlg_t, + spgemmDescr: cuda_types::cusparse::cusparseSpGEMMDescr_t, + bufferSize2: *mut usize, + externalBuffer2: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSpGEMM_copy( + handle: cuda_types::cusparse::cusparseHandle_t, + opA: cuda_types::cusparse::cusparseOperation_t, + opB: cuda_types::cusparse::cusparseOperation_t, + alpha: *const ::core::ffi::c_void, + matA: cuda_types::cusparse::cusparseConstSpMatDescr_t, + matB: cuda_types::cusparse::cusparseConstSpMatDescr_t, + beta: *const ::core::ffi::c_void, + matC: cuda_types::cusparse::cusparseSpMatDescr_t, + computeType: cuda_types::cusparse::cudaDataType, + alg: cuda_types::cusparse::cusparseSpGEMMAlg_t, + spgemmDescr: cuda_types::cusparse::cusparseSpGEMMDescr_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSpGEMMreuse_workEstimation( + handle: cuda_types::cusparse::cusparseHandle_t, + opA: cuda_types::cusparse::cusparseOperation_t, + opB: cuda_types::cusparse::cusparseOperation_t, + matA: cuda_types::cusparse::cusparseConstSpMatDescr_t, + matB: cuda_types::cusparse::cusparseConstSpMatDescr_t, + matC: cuda_types::cusparse::cusparseSpMatDescr_t, + alg: cuda_types::cusparse::cusparseSpGEMMAlg_t, + spgemmDescr: cuda_types::cusparse::cusparseSpGEMMDescr_t, + bufferSize1: *mut usize, + externalBuffer1: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSpGEMMreuse_nnz( + handle: cuda_types::cusparse::cusparseHandle_t, + opA: cuda_types::cusparse::cusparseOperation_t, + opB: cuda_types::cusparse::cusparseOperation_t, + matA: cuda_types::cusparse::cusparseConstSpMatDescr_t, + matB: cuda_types::cusparse::cusparseConstSpMatDescr_t, + matC: cuda_types::cusparse::cusparseSpMatDescr_t, + alg: cuda_types::cusparse::cusparseSpGEMMAlg_t, + spgemmDescr: cuda_types::cusparse::cusparseSpGEMMDescr_t, + bufferSize2: *mut usize, + externalBuffer2: *mut ::core::ffi::c_void, + bufferSize3: *mut usize, + externalBuffer3: *mut ::core::ffi::c_void, + bufferSize4: *mut usize, + externalBuffer4: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSpGEMMreuse_copy( + handle: cuda_types::cusparse::cusparseHandle_t, + opA: cuda_types::cusparse::cusparseOperation_t, + opB: cuda_types::cusparse::cusparseOperation_t, + matA: cuda_types::cusparse::cusparseConstSpMatDescr_t, + matB: cuda_types::cusparse::cusparseConstSpMatDescr_t, + matC: cuda_types::cusparse::cusparseSpMatDescr_t, + alg: cuda_types::cusparse::cusparseSpGEMMAlg_t, + spgemmDescr: cuda_types::cusparse::cusparseSpGEMMDescr_t, + bufferSize5: *mut usize, + externalBuffer5: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + 
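Nearly all execution entry points above share one calling convention: query the workspace size, allocate, then execute with the caller-owned buffer (SpVV, SpMV, SpMM, SpSV, SpSM, SDDMM and the staged SpGEMM phases all follow it). A minimal sketch of the pattern using cusparseSpMM, with the caveats that descriptors are assumed to be already created, `device_alloc`/`device_free` are hypothetical stand-ins for a real device allocator such as cuMemAlloc/cuMemFree (not defined by this patch), and the constant paths (`CUSPARSE_OPERATION_NON_TRANSPOSE`, `CUSPARSE_SPMM_ALG_DEFAULT`) are assumed to follow the generated bindings:

```rust
use cuda_types::cusparse::*;

// Illustrative sketch only; see the assumptions above.
unsafe fn spmm_with_workspace(
    handle: cusparseHandle_t,
    alpha: *const ::core::ffi::c_void,
    mat_a: cusparseConstSpMatDescr_t,
    mat_b: cusparseConstDnMatDescr_t,
    beta: *const ::core::ffi::c_void,
    mat_c: cusparseDnMatDescr_t,
    device_alloc: unsafe fn(usize) -> *mut ::core::ffi::c_void,
    device_free: unsafe fn(*mut ::core::ffi::c_void),
) -> cusparseStatus_t {
    let op = cusparseOperation_t::CUSPARSE_OPERATION_NON_TRANSPOSE;
    let alg = cusparseSpMMAlg_t::CUSPARSE_SPMM_ALG_DEFAULT;
    let compute = cudaDataType::CUDA_R_32F;
    // Phase 1: ask how much scratch space this (op, alg, type) combination needs.
    let mut buffer_size = 0usize;
    let _ = cusparseSpMM_bufferSize(
        handle, op, op, alpha, mat_a, mat_b, beta, mat_c, compute, alg,
        &mut buffer_size,
    ); // status check elided in this sketch
    // Phase 2: execute with a caller-owned workspace.
    let workspace = device_alloc(buffer_size);
    let status = cusparseSpMM(
        handle, op, op, alpha, mat_a, mat_b, beta, mat_c, compute, alg, workspace,
    );
    device_free(workspace);
    status
}
```

Splitting the size query from the execution keeps allocation policy with the caller, which is also what lets a translation layer substitute its own allocator without changing the API surface.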
#[must_use] + fn cusparseSpGEMMreuse_compute( + handle: cuda_types::cusparse::cusparseHandle_t, + opA: cuda_types::cusparse::cusparseOperation_t, + opB: cuda_types::cusparse::cusparseOperation_t, + alpha: *const ::core::ffi::c_void, + matA: cuda_types::cusparse::cusparseConstSpMatDescr_t, + matB: cuda_types::cusparse::cusparseConstSpMatDescr_t, + beta: *const ::core::ffi::c_void, + matC: cuda_types::cusparse::cusparseSpMatDescr_t, + computeType: cuda_types::cusparse::cudaDataType, + alg: cuda_types::cusparse::cusparseSpGEMMAlg_t, + spgemmDescr: cuda_types::cusparse::cusparseSpGEMMDescr_t, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSDDMM_bufferSize( + handle: cuda_types::cusparse::cusparseHandle_t, + opA: cuda_types::cusparse::cusparseOperation_t, + opB: cuda_types::cusparse::cusparseOperation_t, + alpha: *const ::core::ffi::c_void, + matA: cuda_types::cusparse::cusparseConstDnMatDescr_t, + matB: cuda_types::cusparse::cusparseConstDnMatDescr_t, + beta: *const ::core::ffi::c_void, + matC: cuda_types::cusparse::cusparseSpMatDescr_t, + computeType: cuda_types::cusparse::cudaDataType, + alg: cuda_types::cusparse::cusparseSDDMMAlg_t, + bufferSize: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSDDMM_preprocess( + handle: cuda_types::cusparse::cusparseHandle_t, + opA: cuda_types::cusparse::cusparseOperation_t, + opB: cuda_types::cusparse::cusparseOperation_t, + alpha: *const ::core::ffi::c_void, + matA: cuda_types::cusparse::cusparseConstDnMatDescr_t, + matB: cuda_types::cusparse::cusparseConstDnMatDescr_t, + beta: *const ::core::ffi::c_void, + matC: cuda_types::cusparse::cusparseSpMatDescr_t, + computeType: cuda_types::cusparse::cudaDataType, + alg: cuda_types::cusparse::cusparseSDDMMAlg_t, + externalBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSDDMM( + handle: cuda_types::cusparse::cusparseHandle_t, + opA: cuda_types::cusparse::cusparseOperation_t, + opB: cuda_types::cusparse::cusparseOperation_t, + alpha: *const ::core::ffi::c_void, + matA: cuda_types::cusparse::cusparseConstDnMatDescr_t, + matB: cuda_types::cusparse::cusparseConstDnMatDescr_t, + beta: *const ::core::ffi::c_void, + matC: cuda_types::cusparse::cusparseSpMatDescr_t, + computeType: cuda_types::cusparse::cudaDataType, + alg: cuda_types::cusparse::cusparseSDDMMAlg_t, + externalBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSpMMOp_createPlan( + handle: cuda_types::cusparse::cusparseHandle_t, + plan: *mut cuda_types::cusparse::cusparseSpMMOpPlan_t, + opA: cuda_types::cusparse::cusparseOperation_t, + opB: cuda_types::cusparse::cusparseOperation_t, + matA: cuda_types::cusparse::cusparseConstSpMatDescr_t, + matB: cuda_types::cusparse::cusparseConstDnMatDescr_t, + matC: cuda_types::cusparse::cusparseDnMatDescr_t, + computeType: cuda_types::cusparse::cudaDataType, + alg: cuda_types::cusparse::cusparseSpMMOpAlg_t, + addOperationNvvmBuffer: *const ::core::ffi::c_void, + addOperationBufferSize: usize, + mulOperationNvvmBuffer: *const ::core::ffi::c_void, + mulOperationBufferSize: usize, + epilogueNvvmBuffer: *const ::core::ffi::c_void, + epilogueBufferSize: usize, + SpMMWorkspaceSize: *mut usize, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn cusparseSpMMOp( + plan: cuda_types::cusparse::cusparseSpMMOpPlan_t, + externalBuffer: *mut ::core::ffi::c_void, + ) -> cuda_types::cusparse::cusparseStatus_t; + #[must_use] + fn 
cusparseSpMMOp_destroyPlan( + plan: cuda_types::cusparse::cusparseSpMMOpPlan_t, + ) -> cuda_types::cusparse::cusparseStatus_t; +} diff --git a/cuda_base/src/lib.rs b/cuda_base/src/lib.rs index 58f5eae..88db912 100644 --- a/cuda_base/src/lib.rs +++ b/cuda_base/src/lib.rs @@ -15,6 +15,11 @@ use syn::{ const CUDA_RS: &'static str = include_str! {"cuda.rs"}; const NVML_RS: &'static str = include_str! {"nvml.rs"}; +const CUBLAS_RS: &'static str = include_str! {"cublas.rs"}; +const CUBLASLT_RS: &'static str = include_str! {"cublaslt.rs"}; +const CUFFT_RS: &'static str = include_str! {"cufft.rs"}; +const CUSPARSE_RS: &'static str = include_str! {"cusparse.rs"}; +const CUDNN9_RS: &'static str = include_str! {"cudnn9.rs"}; // This macro accepts following arguments: // * `normal_macro`: ident for a normal macro @@ -35,6 +40,31 @@ pub fn cuda_function_declarations(tokens: TokenStream) -> TokenStream { function_declarations(tokens, CUDA_RS) } +#[proc_macro] +pub fn cublas_function_declarations(tokens: TokenStream) -> TokenStream { + function_declarations(tokens, CUBLAS_RS) +} + +#[proc_macro] +pub fn cublaslt_function_declarations(tokens: TokenStream) -> TokenStream { + function_declarations(tokens, CUBLASLT_RS) +} + +#[proc_macro] +pub fn cufft_function_declarations(tokens: TokenStream) -> TokenStream { + function_declarations(tokens, CUFFT_RS) +} + +#[proc_macro] +pub fn cusparse_function_declarations(tokens: TokenStream) -> TokenStream { + function_declarations(tokens, CUSPARSE_RS) +} + +#[proc_macro] +pub fn cudnn9_function_declarations(tokens: TokenStream) -> TokenStream { + function_declarations(tokens, CUDNN9_RS) +} + fn function_declarations(tokens: TokenStream, module: &str) -> TokenStream { let input = parse_macro_input!(tokens as FnDeclInput); let mut cuda_module = syn::parse_str::<syn::File>(module).unwrap(); diff --git a/cuda_base/src/nvml.rs b/cuda_base/src/nvml.rs index b89ef7a..df28038 100644 --- a/cuda_base/src/nvml.rs +++ b/cuda_base/src/nvml.rs @@ -208,6 +208,27 @@ extern "system" { deviceArray: *mut cuda_types::nvml::nvmlDevice_t, ) -> cuda_types::nvml::nvmlReturn_t; #[must_use] + /** Retrieves the driver branch of the NVIDIA driver installed on the system. + + For all products. + + The branch identifier is an alphanumeric string. It will not exceed 80 characters in length + (including the NULL terminator). See \ref nvmlConstants::NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE. + + @param branchInfo Pointer to the driver branch information structure \a nvmlSystemDriverBranchInfo_t + @param length The maximum allowed length of the driver branch string + + @return + - \ref NVML_SUCCESS successful completion + - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + - \ref NVML_ERROR_INVALID_ARGUMENT if \a branchInfo is NULL + - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small + - \ref NVML_ERROR_UNKNOWN on any unexpected error*/ + fn nvmlSystemGetDriverBranch( + branchInfo: *mut cuda_types::nvml::nvmlSystemDriverBranchInfo_t, + length: ::core::ffi::c_uint, + ) -> cuda_types::nvml::nvmlReturn_t; + #[must_use] /** Retrieves the number of units in the system. For S-class products. @@ -664,6 +685,19 @@ extern "system" { length: ::core::ffi::c_uint, ) -> cuda_types::nvml::nvmlReturn_t; #[must_use] + /** Get a unique identifier for the device module on the baseboard + + This API retrieves a unique identifier for each GPU module that exists on a given baseboard. + For non-baseboard products, this ID would always be 0.
+ + @param device The identifier of the target device + @param moduleId Unique identifier for the GPU module + + @return + - \ref NVML_SUCCESS if \a moduleId has been successfully retrieved + - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a moduleId is invalid + - \ref NVML_ERROR_UNKNOWN on any unexpected error*/ fn nvmlDeviceGetModuleId( device: cuda_types::nvml::nvmlDevice_t, moduleId: *mut ::core::ffi::c_uint, @@ -1153,7 +1187,7 @@ extern "system" { For all products. - See \ref nvmlPciInfoExt_t for details on the available PCI info. + See \ref nvmlPciInfoExt_v1_t for details on the available PCI info. @param device The identifier of the target device @param pci Reference in which to return the PCI info @@ -1626,6 +1660,31 @@ extern "system" { speed: *mut ::core::ffi::c_uint, ) -> cuda_types::nvml::nvmlReturn_t; #[must_use] + /** Retrieves the intended operating speed in rotations per minute (RPM) of the device's specified fan. + + For Maxwell &tm; or newer fully supported devices. + + For all discrete products with dedicated fans. + + Note: The reported speed is the intended fan speed. If the fan is physically blocked and unable to spin, the + output will not match the actual fan speed. + + @param device The identifier of the target device + @param fanSpeed Structure specifying the index of the target fan (input) and + retrieved fan speed value (output) + + @return + - \ref NVML_SUCCESS If everything worked + - \ref NVML_ERROR_UNINITIALIZED If the library has not been successfully initialized + - \ref NVML_ERROR_INVALID_ARGUMENT If \a device is invalid, \a fan is not an acceptable + index, or \a speed is NULL + - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH If the provided version is invalid/unsupported + - \ref NVML_ERROR_NOT_SUPPORTED If the \a device does not support this feature*/ + fn nvmlDeviceGetFanSpeedRPM( + device: cuda_types::nvml::nvmlDevice_t, + fanSpeed: *mut cuda_types::nvml::nvmlFanSpeedInfo_t, + ) -> cuda_types::nvml::nvmlReturn_t; + #[must_use] /** Retrieves the intended target speed of the device's specified fan. Normally, the driver dynamically adjusts the fan based on @@ -1718,29 +1777,58 @@ extern "system" { numFans: *mut ::core::ffi::c_uint, ) -> cuda_types::nvml::nvmlReturn_t; #[must_use] - /** Retrieves the current temperature readings for the device, in degrees C. - - For all products. - - See \ref nvmlTemperatureSensors_t for details on available temperature sensors. - - @param device The identifier of the target device - @param sensorType Flag that indicates which sensor reading to retrieve - @param temp Reference in which to return the temperature reading - - @return - - \ref NVML_SUCCESS if \a temp has been set - - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a sensorType is invalid or \a temp is NULL - - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have the specified sensor - - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - - \ref NVML_ERROR_UNKNOWN on any unexpected error*/ + /// @deprecated Use \ref nvmlDeviceGetTemperatureV instead fn nvmlDeviceGetTemperature( device: cuda_types::nvml::nvmlDevice_t, sensorType: cuda_types::nvml::nvmlTemperatureSensors_t, temp: *mut ::core::ffi::c_uint, ) -> cuda_types::nvml::nvmlReturn_t; #[must_use] + /** Retrieves the cooler's information. 
+ Returns a cooler's control signal characteristics. The possible types are restricted, Variable and Toggle. + See \ref nvmlCoolerControl_t for details on available signal types. + Returns objects that cooler cools. Targets may be GPU, Memory, Power Supply or All of these. + See \ref nvmlCoolerTarget_t for details on available targets. + + For Maxwell &tm; or newer fully supported devices. + + For all discrete products with dedicated fans. + + @param[in] device The identifier of the target device + @param[out] coolerInfo Structure specifying the cooler's control signal characteristics (out) + and the target that cooler cools (out) + + @return + - \ref NVML_SUCCESS If everything worked + - \ref NVML_ERROR_UNINITIALIZED If the library has not been successfully initialized + - \ref NVML_ERROR_INVALID_ARGUMENT If \a device is invalid, \a signalType or \a target is NULL + - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH If the provided version is invalid/unsupported + - \ref NVML_ERROR_NOT_SUPPORTED If the \a device does not support this feature*/ + fn nvmlDeviceGetCoolerInfo( + device: cuda_types::nvml::nvmlDevice_t, + coolerInfo: *mut cuda_types::nvml::nvmlCoolerInfo_t, + ) -> cuda_types::nvml::nvmlReturn_t; + #[must_use] + /** Retrieves the current temperature readings (in degrees C) for the given device. + + For all products. + + @param[in] device Target device identifier. + @param[in,out] temperature Structure specifying the sensor type (input) and retrieved + temperature value (output). + + @return + - \ref NVML_SUCCESS if \a temp has been set + - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a sensorType is invalid or \a temp is NULL + - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have the specified sensor + - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + - \ref NVML_ERROR_UNKNOWN on any unexpected error*/ + fn nvmlDeviceGetTemperatureV( + device: cuda_types::nvml::nvmlDevice_t, + temperature: *mut cuda_types::nvml::nvmlTemperature_t, + ) -> cuda_types::nvml::nvmlReturn_t; + #[must_use] /** Retrieves the temperature threshold for the GPU with the specified threshold type in degrees C. For Kepler &tm; or newer fully supported devices. @@ -1771,6 +1859,23 @@ extern "system" { temp: *mut ::core::ffi::c_uint, ) -> cuda_types::nvml::nvmlReturn_t; #[must_use] + /** Retrieves the thermal margin temperature (distance to nearest slowdown threshold). + + @param[in] device The identifier of the target device + @param[in,out] marginTempInfo Versioned structure in which to return the temperature reading + + @returns + - \ref NVML_SUCCESS if the margin temperature was retrieved successfully + - \ref NVML_ERROR_NOT_SUPPORTED if request is not supported on the current platform + - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a temperature is NULL + - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH if the right versioned structure is not used + - \ref NVML_ERROR_UNKNOWN on any unexpected error*/ + fn nvmlDeviceGetMarginTemperature( + device: cuda_types::nvml::nvmlDevice_t, + marginTempInfo: *mut cuda_types::nvml::nvmlMarginTemperature_t, + ) -> cuda_types::nvml::nvmlReturn_t; + #[must_use] /** Used to execute a list of thermal system instructions. 
@param device The identifier of the target device @@ -2011,6 +2116,163 @@ extern "system" { maxOffset: *mut ::core::ffi::c_int, ) -> cuda_types::nvml::nvmlReturn_t; #[must_use] + /** Retrieve min, max and current clock offset of some clock domain for a given PState + + For Maxwell &tm; or newer fully supported devices. + + Note: \ref nvmlDeviceGetGpcClkVfOffset, \ref nvmlDeviceGetMemClkVfOffset, \ref nvmlDeviceGetGpcClkMinMaxVfOffset and + \ref nvmlDeviceGetMemClkMinMaxVfOffset will be deprecated in a future release. +Use \ref nvmlDeviceGetClockOffsets instead. + + @param device The identifier of the target device + @param info Structure specifying the clock type (input) and the pstate (input) + retrieved clock offset value (output), min clock offset (output) + and max clock offset (output) + + @return + - \ref NVML_SUCCESS If everything worked + - \ref NVML_ERROR_UNINITIALIZED If the library has not been successfully initialized + - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a type or \a pstate are invalid or both + \a minClockOffsetMHz and \a maxClockOffsetMHz are NULL + - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH If the provided version is invalid/unsupported + - \ref NVML_ERROR_NOT_SUPPORTED If the device does not support this feature*/ + fn nvmlDeviceGetClockOffsets( + device: cuda_types::nvml::nvmlDevice_t, + info: *mut cuda_types::nvml::nvmlClockOffset_t, + ) -> cuda_types::nvml::nvmlReturn_t; + #[must_use] + /** Control current clock offset of some clock domain for a given PState + + For Maxwell &tm; or newer fully supported devices. + + Requires privileged user. + + @param device The identifier of the target device + @param info Structure specifying the clock type (input), the pstate (input) + and clock offset value (input) + + @return + - \ref NVML_SUCCESS If everything worked + - \ref NVML_ERROR_UNINITIALIZED If the library has not been successfully initialized + - \ref NVML_ERROR_NO_PERMISSION If the user doesn't have permission to perform this operation + - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a type or \a pstate are invalid, or + \a clockOffsetMHz is out of the allowed range. + - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH If the provided version is invalid/unsupported + - \ref NVML_ERROR_NOT_SUPPORTED If the device does not support this feature*/ + fn nvmlDeviceSetClockOffsets( + device: cuda_types::nvml::nvmlDevice_t, + info: *mut cuda_types::nvml::nvmlClockOffset_t, + ) -> cuda_types::nvml::nvmlReturn_t; + #[must_use] + /** Retrieves a performance mode string with all the + performance modes defined for this device along with their associated + GPU Clock and Memory Clock values. + Not all tokens will be reported on all GPUs, and additional tokens + may be added in the future. + For backwards compatibility we still provide nvclock and memclock; + those are the same as nvclockmin and memclockmin. + + Note: These clock values take into account the offset + set by clients through \ref nvmlDeviceSetClockOffsets. + + Maximum available Pstate (P15) shows the minimum performance level (0) and vice versa. + + Each performance mode is returned as a comma-separated list of + "token=value" pairs. Each set of performance mode tokens is separated + by a ";".
Valid tokens: + + Token Value + "perf" unsigned int - the Performance level + "nvclock" unsigned int - the GPU clocks (in MHz) for the perf level + "nvclockmin" unsigned int - the GPU clocks min (in MHz) for the perf level + "nvclockmax" unsigned int - the GPU clocks max (in MHz) for the perf level + "nvclockeditable" unsigned int - if the GPU clock domain is editable for the perf level + "memclock" unsigned int - the memory clocks (in MHz) for the perf level + "memclockmin" unsigned int - the memory clocks min (in MHz) for the perf level + "memclockmax" unsigned int - the memory clocks max (in MHz) for the perf level + "memclockeditable" unsigned int - if the memory clock domain is editable for the perf level + "memtransferrate" unsigned int - the memory transfer rate (in MHz) for the perf level + "memtransferratemin" unsigned int - the memory transfer rate min (in MHz) for the perf level + "memtransferratemax" unsigned int - the memory transfer rate max (in MHz) for the perf level + "memtransferrateeditable" unsigned int - if the memory transfer rate is editable for the perf level + + Example: + + perf=0, nvclock=324, nvclockmin=324, nvclockmax=324, nvclockeditable=0, + memclock=324, memclockmin=324, memclockmax=324, memclockeditable=0, + memtransferrate=648, memtransferratemin=648, memtransferratemax=648, + memtransferrateeditable=0 ; + perf=1, nvclock=324, nvclockmin=324, nvclockmax=640, nvclockeditable=0, + memclock=810, memclockmin=810, memclockmax=810, memclockeditable=0, + memtransferrate=1620, memtransferratemin=1620, memtransferratemax=1620, + memtransferrateeditable=0 ; + + + @param device The identifier of the target device + @param perfModes Reference in which to return the performance level string + + @return + - \ref NVML_SUCCESS if \a perfModes has been set + - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a perfModes is NULL + - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small + - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + - \ref NVML_ERROR_UNKNOWN on any unexpected error*/ + fn nvmlDeviceGetPerformanceModes( + device: cuda_types::nvml::nvmlDevice_t, + perfModes: *mut cuda_types::nvml::nvmlDevicePerfModes_t, + ) -> cuda_types::nvml::nvmlReturn_t; + #[must_use] + /** Retrieves a string with the associated current GPU Clock and Memory Clock values. + + Not all tokens will be reported on all GPUs, and additional tokens + may be added in the future. + + Note: These clock values take into account the offset + set by clients through \ref nvmlDeviceSetClockOffsets. + + Clock values are returned as a comma-separated list of + "token=value" pairs.
+ Valid tokens: + + Token Value + "perf" unsigned int - the Performance level + "nvclock" unsigned int - the GPU clocks (in MHz) for the perf level + "nvclockmin" unsigned int - the GPU clocks min (in MHz) for the perf level + "nvclockmax" unsigned int - the GPU clocks max (in MHz) for the perf level + "nvclockeditable" unsigned int - if the GPU clock domain is editable for the perf level + "memclock" unsigned int - the memory clocks (in MHz) for the perf level + "memclockmin" unsigned int - the memory clocks min (in MHz) for the perf level + "memclockmax" unsigned int - the memory clocks max (in MHz) for the perf level + "memclockeditable" unsigned int - if the memory clock domain is editable for the perf level + "memtransferrate" unsigned int - the memory transfer rate (in MHz) for the perf level + "memtransferratemin" unsigned int - the memory transfer rate min (in MHz) for the perf level + "memtransferratemax" unsigned int - the memory transfer rate max (in MHz) for the perf level + "memtransferrateeditable" unsigned int - if the memory transfer rate is editable for the perf level + + Example: + + nvclock=324, nvclockmin=324, nvclockmax=324, nvclockeditable=0, + memclock=324, memclockmin=324, memclockmax=324, memclockeditable=0, + memtransferrate=648, memtransferratemin=648, memtransferratemax=648, + memtransferrateeditable=0 ; + + + @param device The identifier of the target device + @param currentClockFreqs Reference in which to return the performance level string + + @return + - \ref NVML_SUCCESS if \a currentClockFreqs has been set + - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a name is NULL + - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small + - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + - \ref NVML_ERROR_UNKNOWN on any unexpected error*/ + fn nvmlDeviceGetCurrentClockFreqs( + device: cuda_types::nvml::nvmlDevice_t, + currentClockFreqs: *mut cuda_types::nvml::nvmlDeviceCurrentClockFreqs_t, + ) -> cuda_types::nvml::nvmlReturn_t; + #[must_use] /** This API has been deprecated. Retrieves the power management mode associated with this device. @@ -2247,6 +2509,7 @@ extern "system" { memory: *mut cuda_types::nvml::nvmlMemory_t, ) -> cuda_types::nvml::nvmlReturn_t; #[must_use] + /// nvmlDeviceGetMemoryInfo_v2 accounts separately for reserved memory and includes it in the used memory amount. fn nvmlDeviceGetMemoryInfo_v2( device: cuda_types::nvml::nvmlDevice_t, memory: *mut cuda_types::nvml::nvmlMemory_v2_t, @@ -2301,6 +2564,69 @@ extern "system" { minor: *mut ::core::ffi::c_int, ) -> cuda_types::nvml::nvmlReturn_t; #[must_use] + /** Retrieves the current and pending DRAM Encryption modes for the device. + + %BLACKWELL_OR_NEWER% + Only applicable to devices that support DRAM Encryption + Requires \a NVML_INFOROM_DEN version 1.0 or higher. + + Changing DRAM Encryption modes requires a reboot. The "pending" DRAM Encryption mode refers to the target mode following + the next reboot. + + See \ref nvmlEnableState_t for details on allowed modes. 
+ + @param device The identifier of the target device + @param current Reference in which to return the current DRAM Encryption mode + @param pending Reference in which to return the pending DRAM Encryption mode + + @return + - \ref NVML_SUCCESS if \a current and \a pending have been set + - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or either \a current or \a pending is NULL + - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH if the argument version is not supported + - \ref NVML_ERROR_UNKNOWN on any unexpected error + + @see nvmlDeviceSetDramEncryptionMode()*/ + fn nvmlDeviceGetDramEncryptionMode( + device: cuda_types::nvml::nvmlDevice_t, + current: *mut cuda_types::nvml::nvmlDramEncryptionInfo_t, + pending: *mut cuda_types::nvml::nvmlDramEncryptionInfo_t, + ) -> cuda_types::nvml::nvmlReturn_t; + #[must_use] + /** Set the DRAM Encryption mode for the device. + + For Kepler &tm; or newer fully supported devices. + Only applicable to devices that support DRAM Encryption. + Requires \a NVML_INFOROM_DEN version 1.0 or higher. + Requires root/admin permissions. + + The DRAM Encryption mode determines whether the GPU enables its DRAM Encryption support. + + This operation takes effect after the next reboot. + + See \ref nvmlEnableState_t for details on available modes. + + @param device The identifier of the target device + @param dramEncryption The target DRAM Encryption mode + + @return + - \ref NVML_SUCCESS if the DRAM Encryption mode was set + - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a DRAM Encryption is invalid + - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH if the argument version is not supported + - \ref NVML_ERROR_UNKNOWN on any unexpected error + + @see nvmlDeviceGetDramEncryptionMode()*/ + fn nvmlDeviceSetDramEncryptionMode( + device: cuda_types::nvml::nvmlDevice_t, + dramEncryption: *const cuda_types::nvml::nvmlDramEncryptionInfo_t, + ) -> cuda_types::nvml::nvmlReturn_t; + #[must_use] /** Retrieves the current and pending ECC modes for the device. For Fermi &tm; or newer fully supported devices. @@ -2766,11 +3092,11 @@ extern "system" { #[must_use] /** Retrieves the current and pending driver model for the device. - For Fermi &tm; or newer fully supported devices. + For Kepler &tm; or newer fully supported devices. For windows only. - On Windows platforms the device driver can run in either WDDM or WDM (TCC) mode. If a display is attached - to the device it must run in WDDM mode. TCC mode is preferred if a display is not attached. + On Windows platforms the device driver can run in either WDDM, MCDM or WDM (TCC) modes. If a display is attached + to the device it must run in WDDM mode. MCDM mode is preferred if a display is not attached. TCC mode is deprecated. See \ref nvmlDriverModel_t for details on available driver models. 
@@ -2786,8 +3112,8 @@ extern "system" { - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - \ref NVML_ERROR_UNKNOWN on any unexpected error - @see nvmlDeviceSetDriverModel()*/ - fn nvmlDeviceGetDriverModel( + @see nvmlDeviceSetDriverModel_v2()*/ + fn nvmlDeviceGetDriverModel_v2( device: cuda_types::nvml::nvmlDevice_t, current: *mut cuda_types::nvml::nvmlDriverModel_t, pending: *mut cuda_types::nvml::nvmlDriverModel_t, @@ -2928,7 +3254,7 @@ extern "system" { infos: *mut cuda_types::nvml::nvmlProcessInfo_t, ) -> cuda_types::nvml::nvmlReturn_t; #[must_use] - /** Get information about processes with a MPS compute context on a device + /** Get information about processes with a Multi-Process Service (MPS) compute context on a device For Volta &tm; or newer fully supported devices. @@ -2975,17 +3301,17 @@ extern "system" { #[must_use] /** Get information about running processes on a device for input context - %HOPPER_OR_NEWER% + For Hopper &tm; or newer fully supported devices. This function returns information only about running processes (e.g. CUDA application which have active context). - To determine the size of the @ref plist->procArray array to allocate, call the function with - @ref plist->numProcArrayEntries set to zero and @ref plist->procArray set to NULL. The return + To determine the size of the \a plist->procArray array to allocate, call the function with + \a plist->numProcArrayEntries set to zero and \a plist->procArray set to NULL. The return code will be either NVML_ERROR_INSUFFICIENT_SIZE (if there are valid processes of type - @ref plist->mode to report on, in which case the @ref plist->numProcArrayEntries field will + \a plist->mode to report on, in which case the \a plist->numProcArrayEntries field will indicate the required number of entries in the array) or NVML_SUCCESS (if no processes of type - @ref plist->mode exist). + \a plist->mode exist). The usedGpuMemory field returned is all of the memory used by the application. The usedGpuCcProtectedMemory field returned is all of the protected memory used by the application. 
@@ -3002,10 +3328,10 @@ extern "system" { @param device The device handle or MIG device handle @param plist Reference in which to process detail list - @param plist->version The api version - @param plist->mode The process mode - @param plist->procArray Reference in which to return the process information - @param plist->numProcArrayEntries Proc array size of returned entries + \a plist->version The api version + \a plist->mode The process mode + \a plist->procArray Reference in which to return the process information + \a plist->numProcArrayEntries Proc array size of returned entries @return - \ref NVML_SUCCESS if \a plist->numprocArrayEntries and \a plist->procArray have been populated @@ -3203,7 +3529,7 @@ extern "system" { @param numCores The number of cores for the specified device @return - - \ref NVML_SUCCESS if Gpu core count is successfully retrieved + - \ref NVML_SUCCESS if GPU core count is successfully retrieved - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a numCores is NULL - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device @@ -3254,7 +3580,7 @@ extern "system" { @param maxSpeed The devices's PCIE Max Link speed in MBPS @return - - \ref NVML_SUCCESS if Pcie Max Link Speed is successfully retrieved + - \ref NVML_SUCCESS if PCIe Max Link Speed is successfully retrieved - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a maxSpeed is NULL - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device @@ -3285,8 +3611,8 @@ extern "system" { @param device The identifier of the target device @param adaptiveClockStatus The current adaptive clocking status, either - @ref NVML_ADAPTIVE_CLOCKING_INFO_STATUS_DISABLED - or @ref NVML_ADAPTIVE_CLOCKING_INFO_STATUS_ENABLED + NVML_ADAPTIVE_CLOCKING_INFO_STATUS_DISABLED + or NVML_ADAPTIVE_CLOCKING_INFO_STATUS_ENABLED @return - \ref NVML_SUCCESS if the current adaptive clocking status is successfully retrieved @@ -3308,7 +3634,7 @@ extern "system" { return - \ref NVML_SUCCESS if the bus \a type is successfully retreived - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - - \ref NVML_ERROR_INVALID_ARGUMENT if \device is invalid or \type is NULL + - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a type is NULL - \ref NVML_ERROR_UNKNOWN on any unexpected error*/ fn nvmlDeviceGetBusType( device: cuda_types::nvml::nvmlDevice_t, @@ -3319,7 +3645,7 @@ extern "system" { Get fabric information associated with the device. - %HOPPER_OR_NEWER% + For Hopper &tm; or newer fully supported devices. On Hopper + NVSwitch systems, GPU is registered with the NVIDIA Fabric Manager Upon successful registration, the GPU is added to the NVLink fabric to enable @@ -3350,7 +3676,7 @@ extern "system" { nvmlReturn_t result = nvmlDeviceGetGpuFabricInfoV(device,&fabricInfo); \endcode - %HOPPER_OR_NEWER% + For Hopper &tm; or newer fully supported devices. @param device The identifier of the target device @param gpuFabricInfo Information about GPU fabric state @@ -3450,7 +3776,7 @@ extern "system" { memory: *mut cuda_types::nvml::nvmlMemory_t, ) -> cuda_types::nvml::nvmlReturn_t; #[must_use] - /** Get Conf Computing Gpu certificate details. + /** Get Conf Computing GPU certificate details. For Ampere &tm; or newer fully supported devices. Supported on Linux, Windows TCC. 
@@ -3469,7 +3795,7 @@ gpuCert: *mut cuda_types::nvml::nvmlConfComputeGpuCertificate_t, ) -> cuda_types::nvml::nvmlReturn_t; #[must_use] - /** Get Conf Computing Gpu attestation report. + /** Get Conf Computing GPU attestation report. For Ampere &tm; or newer fully supported devices. Supported on Linux, Windows TCC. @@ -3490,7 +3816,7 @@ #[must_use] /** Get Conf Computing key rotation threshold detail. - %HOPPER_OR_NEWER% + For Hopper &tm; or newer fully supported devices. Supported on Linux, Windows TCC. @param pKeyRotationThrInfo Reference in which to return the key rotation threshold data @@ -3505,21 +3831,79 @@ pKeyRotationThrInfo: *mut cuda_types::nvml::nvmlConfComputeGetKeyRotationThresholdInfo_t, ) -> cuda_types::nvml::nvmlReturn_t; #[must_use] + /** Set Conf Computing Unprotected Memory Size. + + For Ampere &tm; or newer fully supported devices. + Supported on Linux, Windows TCC. + + @param device Device Handle + @param sizeKiB Unprotected Memory size to be set in KiB + + @return + - \ref NVML_SUCCESS if \a sizeKiB successfully set + - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid + - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device*/ + fn nvmlDeviceSetConfComputeUnprotectedMemSize( + device: cuda_types::nvml::nvmlDevice_t, + sizeKiB: ::core::ffi::c_ulonglong, + ) -> cuda_types::nvml::nvmlReturn_t; + #[must_use] + /** Set Conf Computing GPUs ready state. + + For Ampere &tm; or newer fully supported devices. + Supported on Linux, Windows TCC. + + @param isAcceptingWork GPU accepting new work, NVML_CC_ACCEPTING_CLIENT_REQUESTS_TRUE or + NVML_CC_ACCEPTING_CLIENT_REQUESTS_FALSE + + @return + - \ref NVML_SUCCESS if the GPUs ready state is successfully set + - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + - \ref NVML_ERROR_INVALID_ARGUMENT if \a isAcceptingWork is invalid + - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device*/ + fn nvmlSystemSetConfComputeGpusReadyState( + isAcceptingWork: ::core::ffi::c_uint, + ) -> cuda_types::nvml::nvmlReturn_t; + #[must_use] + /** Set Conf Computing key rotation threshold. + + For Hopper &tm; or newer fully supported devices. + Supported on Linux, Windows TCC. + + This function sets the confidential compute key rotation threshold parameters. + \a pKeyRotationThrInfo->maxAttackerAdvantage should be in the range from + NVML_CC_KEY_ROTATION_THRESHOLD_ATTACKER_ADVANTAGE_MIN to NVML_CC_KEY_ROTATION_THRESHOLD_ATTACKER_ADVANTAGE_MAX. + Default value is 60. + + @param pKeyRotationThrInfo Reference to the key rotation threshold data + + @return + - \ref NVML_SUCCESS if the key rotation threshold max attacker advantage has been set + - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + - \ref NVML_ERROR_INVALID_ARGUMENT if \a pKeyRotationThrInfo is NULL or invalid + - \ref NVML_ERROR_INVALID_STATE if confidential compute GPU ready state is enabled + - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + - \ref NVML_ERROR_UNKNOWN on any unexpected error*/ + fn nvmlSystemSetConfComputeKeyRotationThresholdInfo( + pKeyRotationThrInfo: *mut cuda_types::nvml::nvmlConfComputeSetKeyRotationThresholdInfo_t, + ) -> cuda_types::nvml::nvmlReturn_t; + #[must_use] /** Get Conf Computing System Settings.
- %HOPPER_OR_NEWER% + For Hopper &tm; or newer fully supported devices. Supported on Linux, Windows TCC. @param settings System CC settings @return - - \ref NVML_SUCCESS if the query is success - - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a counters is NULL - - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH if the provided version is invalid/unsupported - - \ref NVML_ERROR_UNKNOWN on any unexpected error*/ + - \ref NVML_SUCCESS If the query is success + - \ref NVML_ERROR_UNINITIALIZED If the library has not been successfully initialized + - \ref NVML_ERROR_INVALID_ARGUMENT If \a device is invalid or \a counters is NULL + - \ref NVML_ERROR_NOT_SUPPORTED If the device does not support this feature + - \ref NVML_ERROR_GPU_IS_LOST If the target GPU has fallen off the bus or is otherwise inaccessible + - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH If the provided version is invalid/unsupported + - \ref NVML_ERROR_UNKNOWN On any unexpected error*/ fn nvmlSystemGetConfComputeSettings( settings: *mut cuda_types::nvml::nvmlSystemConfComputeSettings_t, ) -> cuda_types::nvml::nvmlReturn_t; @@ -3554,6 +3938,7 @@ extern "system" { @return - \ref NVML_SUCCESS if GSP firmware mode is sucessfully retrieved - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or any of \a isEnabled or \a defaultMode is NULL + - \ref NVML_ERROR_NOT_SUPPORTED if GSP firmware is not enabled for GPU - \ref NVML_ERROR_UNKNOWN on any unexpected error*/ fn nvmlDeviceGetGspFirmwareMode( device: cuda_types::nvml::nvmlDevice_t, @@ -3561,6 +3946,29 @@ extern "system" { defaultMode: *mut ::core::ffi::c_uint, ) -> cuda_types::nvml::nvmlReturn_t; #[must_use] + /** Get SRAM ECC error status of this device. + + For Ampere &tm; or newer fully supported devices. + Requires root/admin permissions. + + See \ref nvmlEccSramErrorStatus_v1_t for more information on the struct. + + @param device The identifier of the target device + @param status Returns SRAM ECC error status + + @return + - \ref NVML_SUCCESS If \a limit has been set + - \ref NVML_ERROR_UNINITIALIZED If the library has not been successfully initialized + - \ref NVML_ERROR_INVALID_ARGUMENT If \a device is invalid or \a counters is NULL + - \ref NVML_ERROR_NOT_SUPPORTED If the device does not support this feature + - \ref NVML_ERROR_GPU_IS_LOST If the target GPU has fallen off the bus or is otherwise inaccessible + - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH If the version of \a nvmlEccSramErrorStatus_t is invalid + - \ref NVML_ERROR_UNKNOWN On any unexpected error*/ + fn nvmlDeviceGetSramEccErrorStatus( + device: cuda_types::nvml::nvmlDevice_t, + status: *mut cuda_types::nvml::nvmlEccSramErrorStatus_t, + ) -> cuda_types::nvml::nvmlReturn_t; + #[must_use] /** Queries the state of per process accounting mode. For Kepler &tm; or newer fully supported devices. @@ -3628,8 +4036,8 @@ extern "system" { For Kepler &tm; or newer fully supported devices. - To just query the number of processes ready to be queried, call this function with *count = 0 and - pids=NULL. The return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if list is empty. + To query the number of processes under Accounting Mode, call this function with *count = 0 and pids=NULL. 
+ The return code will be NVML_ERROR_INSUFFICIENT_SIZE with an updated count value indicating the number of processes. For more details see \ref nvmlDeviceGetAccountingStats. @@ -3684,7 +4092,7 @@ extern "system" { #[must_use] /** Returns the list of retired pages by source, including pages that are pending retirement The address information provided from this API is the hardware address of the page that was retired. Note - that this does not match the virtual address used in CUDA, but will match the address information in XID 63 + that this does not match the virtual address used in CUDA, but will match the address information in Xid 63 For Kepler &tm; or newer fully supported devices. @@ -3714,7 +4122,7 @@ extern "system" { #[must_use] /** Returns the list of retired pages by source, including pages that are pending retirement The address information provided from this API is the hardware address of the page that was retired. Note - that this does not match the virtual address used in CUDA, but will match the address information in XID 63 + that this does not match the virtual address used in CUDA, but will match the address information in Xid 63 \note nvmlDeviceGetRetiredPages_v2 adds an additional timestamps parameter to return the time of each page's retirement. @@ -3927,22 +4335,43 @@ extern "system" { @param procesesUtilInfo Pointer to the caller-provided structure of nvmlProcessesUtilizationInfo_t. @return - - \ref NVML_SUCCESS if \a procesesUtilInfo->procUtilArray has been populated - - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a procesesUtilInfo is NULL - - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - - \ref NVML_ERROR_NOT_FOUND if sample entries are not found - - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - - \ref NVML_ERROR_VERSION_MISMATCH if the version of \a procesesUtilInfo is invalid - - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a procesesUtilInfo->procUtilArray is NULL, or the buffer size of procesesUtilInfo->procUtilArray is too small. - The caller should check the minimul array size from the returned procesesUtilInfo->processSamplesCount, and call - the function again with a buffer no smaller than procesesUtilInfo->processSamplesCount * sizeof(nvmlProcessUtilizationInfo_t) - - \ref NVML_ERROR_UNKNOWN on any unexpected error*/ + - \ref NVML_SUCCESS If \a procesesUtilInfo->procUtilArray has been populated + - \ref NVML_ERROR_UNINITIALIZED If the library has not been successfully initialized + - \ref NVML_ERROR_INVALID_ARGUMENT If \a device is invalid, or \a procesesUtilInfo is NULL + - \ref NVML_ERROR_NOT_SUPPORTED If the device does not support this feature + - \ref NVML_ERROR_NOT_FOUND If sample entries are not found + - \ref NVML_ERROR_GPU_IS_LOST If the target GPU has fallen off the bus or is otherwise inaccessible + - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH If the version of \a procesesUtilInfo is invalid + - \ref NVML_ERROR_INSUFFICIENT_SIZE If \a procesesUtilInfo->procUtilArray is NULL, or the buffer size of procesesUtilInfo->procUtilArray is too small. 
+ The caller should check the minimum array size from the returned procesesUtilInfo->processSamplesCount, and call
+ the function again with a buffer no smaller than procesesUtilInfo->processSamplesCount * sizeof(nvmlProcessUtilizationInfo_t)
+ - \ref NVML_ERROR_UNKNOWN On any unexpected error*/
 fn nvmlDeviceGetProcessesUtilizationInfo(
 device: cuda_types::nvml::nvmlDevice_t,
 procesesUtilInfo: *mut cuda_types::nvml::nvmlProcessesUtilizationInfo_t,
 ) -> cuda_types::nvml::nvmlReturn_t;
 #[must_use]
+ /** Get platform information of this device.
+
+ %BLACKWELL_OR_NEWER%
+
+ See \ref nvmlPlatformInfo_v1_t for more information on the struct.
+
+ @param device The identifier of the target device
+ @param platformInfo Pointer to the caller-provided structure of nvmlPlatformInfo_t.
+
+ @return
+ - \ref NVML_SUCCESS If \a platformInfo has been retrieved
+ - \ref NVML_ERROR_INVALID_ARGUMENT If \a device is invalid or \a platformInfo is NULL
+ - \ref NVML_ERROR_NOT_SUPPORTED If the device does not support this feature
+ - \ref NVML_ERROR_MEMORY If system memory is insufficient
+ - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH If the version of \a nvmlPlatformInfo_t is invalid
+ - \ref NVML_ERROR_UNKNOWN On any unexpected error*/
+ fn nvmlDeviceGetPlatformInfo(
+ device: cuda_types::nvml::nvmlDevice_t,
+ platformInfo: *mut cuda_types::nvml::nvmlPlatformInfo_t,
+ ) -> cuda_types::nvml::nvmlReturn_t;
+ #[must_use]
 /** Set the LED state for the unit. The LED can be either green (0) or amber (1).
 For S-class products.
@@ -4596,7 +5025,10 @@ extern "system" {
 speed: ::core::ffi::c_uint,
 ) -> cuda_types::nvml::nvmlReturn_t;
 #[must_use]
- /** Set the GPCCLK VF offset value
+ /** Deprecated: Will be deprecated in a future release. Use \ref nvmlDeviceSetClockOffsets instead. It works
+ on Maxwell onwards GPU architectures.
+
+ Set the GPCCLK VF offset value
 @param[in] device The identifier of the target device
 @param[in] offset The GPCCLK VF offset value to set
@@ -4612,7 +5044,10 @@ extern "system" {
 offset: ::core::ffi::c_int,
 ) -> cuda_types::nvml::nvmlReturn_t;
 #[must_use]
- /** Set the MemClk (Memory Clock) VF offset value. It requires elevated privileges.
+ /** Deprecated: Will be deprecated in a future release. Use \ref nvmlDeviceSetClockOffsets instead. It works
+ on Maxwell onwards GPU architectures.
+
+ Set the MemClk (Memory Clock) VF offset value. It requires elevated privileges.
 @param[in] device The identifier of the target device
 @param[in] offset The MemClk VF offset value to set
@@ -4628,64 +5063,6 @@ extern "system" {
 offset: ::core::ffi::c_int,
 ) -> cuda_types::nvml::nvmlReturn_t;
 #[must_use]
- /** Set Conf Computing Unprotected Memory Size.
-
- For Ampere &tm; or newer fully supported devices.
- Supported on Linux, Windows TCC.
-
- @param device Device Handle
- @param sizeKiB Unprotected Memory size to be set in KiB
-
- @return
- - \ref NVML_SUCCESS if \a sizeKiB successfully set
- - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid
- - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device*/
- fn nvmlDeviceSetConfComputeUnprotectedMemSize(
- device: cuda_types::nvml::nvmlDevice_t,
- sizeKiB: ::core::ffi::c_ulonglong,
- ) -> cuda_types::nvml::nvmlReturn_t;
- #[must_use]
- /** Set Conf Computing GPUs ready state.
-
- For Ampere &tm; or newer fully supported devices.
- Supported on Linux, Windows TCC.
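// [Editor's aside - illustrative sketch, not part of the generated bindings or this patch.]
// Several query functions above share the sizing contract just described for
// nvmlDeviceGetAccountingPids and nvmlDeviceGetProcessesUtilizationInfo: probe with a
// zero count (or NULL buffer), receive NVML_ERROR_INSUFFICIENT_SIZE together with the
// required element count, then retry with a buffer of that size. A minimal sketch of
// the pattern, assuming the declarations link as ordinary `extern "system"` functions
// and that `nvmlReturn_t` exposes the usual associated constants; other error codes
// are propagated unchanged.
unsafe fn accounting_pids(
    device: cuda_types::nvml::nvmlDevice_t,
) -> Result<Vec<::core::ffi::c_uint>, cuda_types::nvml::nvmlReturn_t> {
    use cuda_types::nvml::nvmlReturn_t;
    let mut count: ::core::ffi::c_uint = 0;
    // First call: *count = 0 and pids = NULL only reports the required size.
    match nvmlDeviceGetAccountingPids(device, &mut count, core::ptr::null_mut()) {
        nvmlReturn_t::NVML_SUCCESS => Ok(Vec::new()), // no processes recorded
        nvmlReturn_t::NVML_ERROR_INSUFFICIENT_SIZE => {
            // Second call with a buffer no smaller than the updated count.
            let mut pids = vec![0; count as usize];
            match nvmlDeviceGetAccountingPids(device, &mut count, pids.as_mut_ptr()) {
                nvmlReturn_t::NVML_SUCCESS => {
                    pids.truncate(count as usize);
                    Ok(pids)
                }
                err => Err(err),
            }
        }
        err => Err(err),
    }
}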
- - @param isAcceptingWork GPU accepting new work, NVML_CC_ACCEPTING_CLIENT_REQUESTS_TRUE or - NVML_CC_ACCEPTING_CLIENT_REQUESTS_FALSE - - return - - \ref NVML_SUCCESS if \a current GPUs ready state is successfully set - - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - - \ref NVML_ERROR_INVALID_ARGUMENT if \a isAcceptingWork is invalid - - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device*/ - fn nvmlSystemSetConfComputeGpusReadyState( - isAcceptingWork: ::core::ffi::c_uint, - ) -> cuda_types::nvml::nvmlReturn_t; - #[must_use] - /** Set Conf Computing key rotation threshold. - - %HOPPER_OR_NEWER% - Supported on Linux, Windows TCC. - - This function is to set the confidential compute key rotation threshold parameters. - @ref pKeyRotationThrInfo->maxAttackerAdvantage should be in the range from - NVML_CC_KEY_ROTATION_THRESHOLD_ATTACKER_ADVANTAGE_MIN to NVML_CC_KEY_ROTATION_THRESHOLD_ATTACKER_ADVANTAGE_MAX. - Default value is 60. - - @param pKeyRotationThrInfo Reference to the key rotation threshold data - - @return - - \ref NVML_SUCCESS if \a key rotation threashold max attacker advantage has been set - - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a memory is NULL - - \ref NVML_ERROR_INVALID_STATE if confidential compute GPU ready state is enabled - - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device - - \ref NVML_ERROR_UNKNOWN on any unexpected error*/ - fn nvmlSystemSetConfComputeKeyRotationThresholdInfo( - pKeyRotationThrInfo: *mut cuda_types::nvml::nvmlConfComputeSetKeyRotationThresholdInfo_t, - ) -> cuda_types::nvml::nvmlReturn_t; - #[must_use] /** Enables or disables per process accounting. For Kepler &tm; or newer fully supported devices. @@ -4741,6 +5118,41 @@ extern "system" { device: cuda_types::nvml::nvmlDevice_t, ) -> cuda_types::nvml::nvmlReturn_t; #[must_use] + /** Set new power limit of this device. + + For Kepler &tm; or newer fully supported devices. + Requires root/admin permissions. + + See \ref nvmlDeviceGetPowerManagementLimitConstraints to check the allowed ranges of values. + + See \ref nvmlPowerValue_v2_t for more information on the struct. + + \note Limit is not persistent across reboots or driver unloads. + Enable persistent mode to prevent driver from unloading when no application is using the device. + + This API replaces nvmlDeviceSetPowerManagementLimit. It can be used as a drop-in replacement for the older version. 
+
+ @param device The identifier of the target device
+ @param powerValue Power management limit in milliwatts to set
+
+ @return
+ - \ref NVML_SUCCESS if \a limit has been set
+ - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a powerValue is NULL or contains invalid values
+ - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ - \ref NVML_ERROR_UNKNOWN on any unexpected error
+
+ @see NVML_FI_DEV_POWER_AVERAGE
+ @see NVML_FI_DEV_POWER_INSTANT
+ @see NVML_FI_DEV_POWER_MIN_LIMIT
+ @see NVML_FI_DEV_POWER_MAX_LIMIT
+ @see NVML_FI_DEV_POWER_CURRENT_LIMIT*/
+ fn nvmlDeviceSetPowerManagementLimit_v2(
+ device: cuda_types::nvml::nvmlDevice_t,
+ powerValue: *mut cuda_types::nvml::nvmlPowerValue_v2_t,
+ ) -> cuda_types::nvml::nvmlReturn_t;
+ #[must_use]
 /** Retrieves the state of the device's NvLink for the link specified
 For Pascal &tm; or newer fully supported devices.
@@ -4769,7 +5181,7 @@ extern "system" {
 @param device The identifier of the target device
 @param link Specifies the NvLink link to be queried
- @param version Requested NvLink version
+ @param version Requested NvLink version from nvmlNvlinkVersion_t
 @return
 - \ref NVML_SUCCESS if \a version has been set
@@ -5021,6 +5433,103 @@ extern "system" {
 pNvLinkDeviceType: *mut cuda_types::nvml::nvmlIntNvLinkDeviceType_t,
 ) -> cuda_types::nvml::nvmlReturn_t;
 #[must_use]
+ /** Set NvLink Low Power Threshold for device.
+
+ For Hopper &tm; or newer fully supported devices.
+
+ @param device The identifier of the target device
+ @param info Reference to \a nvmlNvLinkPowerThres_t struct
+ input parameters
+
+ @return
+ - \ref NVML_SUCCESS if the \a Threshold is successfully set
+ - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a Threshold is not within range
+ - \ref NVML_ERROR_NOT_READY if an internal driver setting prevents the threshold from being used
+ - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device
+*/
+ fn nvmlDeviceSetNvLinkDeviceLowPowerThreshold(
+ device: cuda_types::nvml::nvmlDevice_t,
+ info: *mut cuda_types::nvml::nvmlNvLinkPowerThres_t,
+ ) -> cuda_types::nvml::nvmlReturn_t;
+ #[must_use]
+ /** Set the global nvlink bandwidth mode
+
+ @param nvlinkBwMode nvlink bandwidth mode
+ @return
+ - \ref NVML_SUCCESS on success
+ - \ref NVML_ERROR_INVALID_ARGUMENT if an invalid argument is provided
+ - \ref NVML_ERROR_IN_USE if P2P object exists
+ - \ref NVML_ERROR_NOT_SUPPORTED if GPU is not Hopper or newer architecture.
+ - \ref NVML_ERROR_NO_PERMISSION if not root user*/
+ fn nvmlSystemSetNvlinkBwMode(
+ nvlinkBwMode: ::core::ffi::c_uint,
+ ) -> cuda_types::nvml::nvmlReturn_t;
+ #[must_use]
+ /** Get the global nvlink bandwidth mode
+
+ @param nvlinkBwMode reference of nvlink bandwidth mode
+ @return
+ - \ref NVML_SUCCESS on success
+ - \ref NVML_ERROR_INVALID_ARGUMENT if an invalid pointer is provided
+ - \ref NVML_ERROR_NOT_SUPPORTED if GPU is not Hopper or newer architecture.
+ - \ref NVML_ERROR_NO_PERMISSION if not root user*/ + fn nvmlSystemGetNvlinkBwMode( + nvlinkBwMode: *mut ::core::ffi::c_uint, + ) -> cuda_types::nvml::nvmlReturn_t; + #[must_use] + /** Get the supported NvLink Reduced Bandwidth Modes of the device + + %BLACKWELL_OR_NEWER% + + @param device The identifier of the target device + @param supportedBwMode Reference to \a nvmlNvlinkSupportedBwModes_t + + @return + - \ref NVML_SUCCESS if the query was successful + - \ref NVML_ERROR_INVALID_ARGUMENT if device is invalid or supportedBwMode is NULL + - \ref NVML_ERROR_NOT_SUPPORTED if this feature is not supported by the device + - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH if the version specified is not supported*/ + fn nvmlDeviceGetNvlinkSupportedBwModes( + device: cuda_types::nvml::nvmlDevice_t, + supportedBwMode: *mut cuda_types::nvml::nvmlNvlinkSupportedBwModes_t, + ) -> cuda_types::nvml::nvmlReturn_t; + #[must_use] + /** Get the NvLink Reduced Bandwidth Mode for the device + + %BLACKWELL_OR_NEWER% + + @param device The identifier of the target device + @param getBwMode Reference to \a nvmlNvlinkGetBwMode_t + + @return + - \ref NVML_SUCCESS if the query was successful + - \ref NVML_ERROR_INVALID_ARGUMENT if device is invalid or getBwMode is NULL + - \ref NVML_ERROR_NOT_SUPPORTED if this feature is not supported by the device + - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH if the version specified is not supported*/ + fn nvmlDeviceGetNvlinkBwMode( + device: cuda_types::nvml::nvmlDevice_t, + getBwMode: *mut cuda_types::nvml::nvmlNvlinkGetBwMode_t, + ) -> cuda_types::nvml::nvmlReturn_t; + #[must_use] + /** Set the NvLink Reduced Bandwidth Mode for the device + + %BLACKWELL_OR_NEWER% + + @param device The identifier of the target device + @param setBwMode Reference to \a nvmlNvlinkSetBwMode_t + + @return + - \ref NVML_SUCCESS if the Bandwidth mode was successfully set + - \ref NVML_ERROR_INVALID_ARGUMENT if device is invalid or setBwMode is NULL + - \ref NVML_ERROR_NO_PERMISSION if user does not have permission to change Bandwidth mode + - \ref NVML_ERROR_NOT_SUPPORTED if this feature is not supported by the device + - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH if the version specified is not supported*/ + fn nvmlDeviceSetNvlinkBwMode( + device: cuda_types::nvml::nvmlDevice_t, + setBwMode: *mut cuda_types::nvml::nvmlNvlinkSetBwMode_t, + ) -> cuda_types::nvml::nvmlReturn_t; + #[must_use] /** Create an empty set of events. Event set should be freed by \ref nvmlEventSetFree @@ -5041,7 +5550,7 @@ extern "system" { /** Starts recording of events on a specified devices and add the events to specified \ref nvmlEventSet_t For Fermi &tm; or newer fully supported devices. - Ecc events are available only on ECC enabled devices (see \ref nvmlDeviceGetTotalEccErrors) + ECC events are available only on ECC-enabled devices (see \ref nvmlDeviceGetTotalEccErrors) Power capping events are available only on Power Management enabled devices (see \ref nvmlDeviceGetPowerManagementMode) For Linux only. @@ -5110,11 +5619,11 @@ extern "system" { but not longer than specified timeout. This function in certain conditions can return before specified timeout passes (e.g. when interrupt arrives) - On Windows, in case of xid error, the function returns the most recent xid error type seen by the system. - If there are multiple xid errors generated before nvmlEventSetWait is invoked then the last seen xid error - type is returned for all xid error events. 
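// [Editor's aside - illustrative sketch, not part of the generated bindings or this patch.]
// The global NvLink bandwidth-mode pair added above is a plain c_uint getter/setter.
// A minimal round trip, assuming nvmlInit has already succeeded; the setter requires
// root and otherwise returns NVML_ERROR_NO_PERMISSION.
unsafe fn nvlink_bw_mode_roundtrip() -> cuda_types::nvml::nvmlReturn_t {
    use cuda_types::nvml::nvmlReturn_t;
    let mut bw_mode: ::core::ffi::c_uint = 0;
    let status = nvmlSystemGetNvlinkBwMode(&mut bw_mode);
    if status != nvmlReturn_t::NVML_SUCCESS {
        return status;
    }
    // Re-applying the mode that was just read is a permission-checked no-op.
    nvmlSystemSetNvlinkBwMode(bw_mode)
}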
+ On Windows, in case of Xid error, the function returns the most recent Xid error type seen by the system.
+ If there are multiple Xid errors generated before nvmlEventSetWait is invoked then the last seen Xid error
+ type is returned for all Xid error events.
- On Linux, every xid error event would return the associated event data and other information if applicable.
+ On Linux, every Xid error event would return the associated event data and other information if applicable.
 In MIG mode, if device handle is provided, the API reports all the events for the available instances,
 only if the caller has appropriate privileges. In absence of required privileges, only the events which
@@ -5374,12 +5883,12 @@ extern "system" {
 @param pHeterogeneousMode Pointer to the caller-provided structure of nvmlVgpuHeterogeneousMode_t
 @return
- - \ref NVML_SUCCESS Upon success
- - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized
- - \ref NVML_ERROR_INVALID_ARGUMENT If \a device is invalid or \a pHeterogeneousMode is NULL
- - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't support this feature
- - \ref NVML_ERROR_VERSION_MISMATCH If the version of \a pHeterogeneousMode is invalid
- - \ref NVML_ERROR_UNKNOWN On any unexpected error*/
+ - \ref NVML_SUCCESS Upon success
+ - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized
+ - \ref NVML_ERROR_INVALID_ARGUMENT If \a device is invalid or \a pHeterogeneousMode is NULL
+ - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't support this feature
+ - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH If the version of \a pHeterogeneousMode is invalid
+ - \ref NVML_ERROR_UNKNOWN On any unexpected error*/
 fn nvmlDeviceGetVgpuHeterogeneousMode(
 device: cuda_types::nvml::nvmlDevice_t,
 pHeterogeneousMode: *mut cuda_types::nvml::nvmlVgpuHeterogeneousMode_t,
@@ -5392,6 +5901,8 @@ extern "system" {
 API would return an appropriate error code upon unsuccessful activation. For example, the heterogeneous mode
 set will fail with error \ref NVML_ERROR_IN_USE if any vGPU instance is active on the device. The caller of
 this API is expected to shutdown the vGPU VMs and retry setting the \a mode.
+ On the KVM platform, setting heterogeneous mode is allowed if no MDEV device is created on the device;
+ otherwise it will fail with the same error, \ref NVML_ERROR_IN_USE.
 On successful return, the function updates the vGPU heterogeneous mode with the user provided \a pHeterogeneousMode->mode.
 \a pHeterogeneousMode->version is the version number of the structure nvmlVgpuHeterogeneousMode_t, the caller
 should set the correct version number to set the vGPU heterogeneous mode.
@@ -5400,14 +5911,14 @@ extern "system" { @param pHeterogeneousMode Pointer to the caller-provided structure of nvmlVgpuHeterogeneousMode_t @return - - \ref NVML_SUCCESS Upon success - - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized - - \ref NVML_ERROR_INVALID_ARGUMENT If \a device or \a pHeterogeneousMode is NULL or \a pHeterogeneousMode->mode is invalid - - \ref NVML_ERROR_IN_USE If the \a device is in use - - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation - - \ref NVML_ERROR_NOT_SUPPORTED If MIG is enabled or \a device doesn't support this feature - - \ref NVML_ERROR_VERSION_MISMATCH If the version of \a pHeterogeneousMode is invalid - - \ref NVML_ERROR_UNKNOWN On any unexpected error*/ + - \ref NVML_SUCCESS Upon success + - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + - \ref NVML_ERROR_INVALID_ARGUMENT If \a device or \a pHeterogeneousMode is NULL or \a pHeterogeneousMode->mode is invalid + - \ref NVML_ERROR_IN_USE If the \a device is in use + - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + - \ref NVML_ERROR_NOT_SUPPORTED If MIG is enabled or \a device doesn't support this feature + - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH If the version of \a pHeterogeneousMode is invalid + - \ref NVML_ERROR_UNKNOWN On any unexpected error*/ fn nvmlDeviceSetVgpuHeterogeneousMode( device: cuda_types::nvml::nvmlDevice_t, pHeterogeneousMode: *const cuda_types::nvml::nvmlVgpuHeterogeneousMode_t, @@ -5424,11 +5935,11 @@ extern "system" { @param pPlacement Pointer to vGPU placement ID structure \a nvmlVgpuPlacementId_t @return - - \ref NVML_SUCCESS If information is successfully retrieved - - \ref NVML_ERROR_NOT_FOUND If \a vgpuInstance does not match a valid active vGPU instance - - \ref NVML_ERROR_INVALID_ARGUMENT If \a vgpuInstance is invalid or \a pPlacement is NULL - - \ref NVML_ERROR_VERSION_MISMATCH If the version of \a pPlacement is invalid - - \ref NVML_ERROR_UNKNOWN On any unexpected error*/ + - \ref NVML_SUCCESS If information is successfully retrieved + - \ref NVML_ERROR_NOT_FOUND If \a vgpuInstance does not match a valid active vGPU instance + - \ref NVML_ERROR_INVALID_ARGUMENT If \a vgpuInstance is invalid or \a pPlacement is NULL + - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH If the version of \a pPlacement is invalid + - \ref NVML_ERROR_UNKNOWN On any unexpected error*/ fn nvmlVgpuInstanceGetPlacementId( vgpuInstance: cuda_types::nvml::nvmlVgpuInstance_t, pPlacement: *mut cuda_types::nvml::nvmlVgpuPlacementId_t, @@ -5436,24 +5947,30 @@ extern "system" { #[must_use] /** Query the supported vGPU placement ID of the vGPU type. - An array of supported vGPU placement IDs for the vGPU type ID indicated by \a vgpuTypeId is returned in the - caller-supplied buffer of \a pPlacementList->placementIds. Memory needed for the placementIds array should be - allocated based on maximum instances of a vGPU type which can be queried via \ref nvmlVgpuTypeGetMaxInstances(). + The function returns an array of supported vGPU placement IDs for the specified vGPU type ID in the buffer provided + by the caller at \a pPlacementList->placementIds. The required memory for the placementIds array must be allocated + based on the maximum number of vGPU type instances, which is retrievable through \ref nvmlVgpuTypeGetMaxInstances(). 
+ If the provided count by the caller is insufficient, the function will return NVML_ERROR_INSUFFICIENT_SIZE along with + the number of required entries in \a pPlacementList->count. The caller should then reallocate a buffer with the size + of pPlacementList->count * sizeof(pPlacementList->placementIds) and invoke the function again. - This function will return supported placement IDs even if GPU is not in vGPU heterogeneous mode. + To obtain a list of homogeneous placement IDs, the caller needs to set \a pPlacementList->mode to NVML_VGPU_PGPU_HOMOGENEOUS_MODE. + For heterogeneous placement IDs, \a pPlacementList->mode should be set to NVML_VGPU_PGPU_HETEROGENEOUS_MODE. + By default, a list of heterogeneous placement IDs is returned. @param device Identifier of the target device @param vgpuTypeId Handle to vGPU type. The vGPU type ID @param pPlacementList Pointer to the vGPU placement structure \a nvmlVgpuPlacementList_t @return - - \ref NVML_SUCCESS Upon success - - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized - - \ref NVML_ERROR_INVALID_ARGUMENT If \a device or \a vgpuTypeId is invalid or \a pPlacementList is NULL - - \ref NVML_ERROR_NOT_SUPPORTED If \a device or \a vgpuTypeId isn't supported - - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation - - \ref NVML_ERROR_VERSION_MISMATCH If the version of \a pPlacementList is invalid - - \ref NVML_ERROR_UNKNOWN On any unexpected error*/ + - \ref NVML_SUCCESS Upon success + - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + - \ref NVML_ERROR_INVALID_ARGUMENT If \a device or \a vgpuTypeId is invalid or \a pPlacementList is NULL + - \ref NVML_ERROR_NOT_SUPPORTED If \a device or \a vgpuTypeId isn't supported + - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH If the version of \a pPlacementList is invalid + - \ref NVML_ERROR_INSUFFICIENT_SIZE If the buffer is small, element count is returned in \a pPlacementList->count + - \ref NVML_ERROR_UNKNOWN On any unexpected error*/ fn nvmlDeviceGetVgpuTypeSupportedPlacements( device: cuda_types::nvml::nvmlDevice_t, vgpuTypeId: cuda_types::nvml::nvmlVgpuTypeId_t, @@ -5465,23 +5982,25 @@ extern "system" { An array of creatable vGPU placement IDs for the vGPU type ID indicated by \a vgpuTypeId is returned in the caller-supplied buffer of \a pPlacementList->placementIds. Memory needed for the placementIds array should be allocated based on maximum instances of a vGPU type which can be queried via \ref nvmlVgpuTypeGetMaxInstances(). + If the provided count by the caller is insufficient, the function will return NVML_ERROR_INSUFFICIENT_SIZE along with + the number of required entries in \a pPlacementList->count. The caller should then reallocate a buffer with the size + of pPlacementList->count * sizeof(pPlacementList->placementIds) and invoke the function again. + The creatable vGPU placement IDs may differ over time, as there may be restrictions on what type of vGPU the vGPU instance is running. - The function will return \ref NVML_ERROR_NOT_SUPPORTED if the \a device is not in vGPU heterogeneous mode. - @param device The identifier of the target device @param vgpuTypeId Handle to vGPU type. 
The vGPU type ID @param pPlacementList Pointer to the list of vGPU placement structure \a nvmlVgpuPlacementList_t @return - - \ref NVML_SUCCESS Upon success - - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized - - \ref NVML_ERROR_INVALID_ARGUMENT If \a device or \a vgpuTypeId is invalid or \a pPlacementList is NULL - - \ref NVML_ERROR_NOT_SUPPORTED If \a device or \a vgpuTypeId isn't supported - - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation - - \ref NVML_ERROR_VERSION_MISMATCH If the version of \a pPlacementList is invalid - - \ref NVML_ERROR_UNKNOWN On any unexpected error*/ + - \ref NVML_SUCCESS Upon success + - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + - \ref NVML_ERROR_INVALID_ARGUMENT If \a device or \a vgpuTypeId is invalid or \a pPlacementList is NULL + - \ref NVML_ERROR_NOT_SUPPORTED If \a device or \a vgpuTypeId isn't supported + - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH If the version of \a pPlacementList is invalid + - \ref NVML_ERROR_UNKNOWN On any unexpected error*/ fn nvmlDeviceGetVgpuTypeCreatablePlacements( device: cuda_types::nvml::nvmlDevice_t, vgpuTypeId: cuda_types::nvml::nvmlVgpuTypeId_t, @@ -5516,6 +6035,28 @@ extern "system" { fbReservation: *mut ::core::ffi::c_ulonglong, ) -> cuda_types::nvml::nvmlReturn_t; #[must_use] + /** Retrieve the currently used runtime state size of the vGPU instance + + This size represents the maximum in-memory data size utilized by a vGPU instance during standard operation. + This measurement is exclusive of frame buffer (FB) data size assigned to the vGPU instance. + + For Maxwell &tm; or newer fully supported devices. + + @param vgpuInstance Identifier of the target vGPU instance + @param pState Pointer to the vGPU runtime state's structure \a nvmlVgpuRuntimeState_t + + @return + - \ref NVML_SUCCESS If information is successfully retrieved + - \ref NVML_ERROR_UNINITIALIZED If the library has not been successfully initialized + - \ref NVML_ERROR_INVALID_ARGUMENT If \a vgpuInstance is invalid, or \a pState is NULL + - \ref NVML_ERROR_NOT_FOUND If \a vgpuInstance does not match a valid active vGPU instance on the system + - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH If the version of \a pState is invalid + - \ref NVML_ERROR_UNKNOWN On any unexpected error*/ + fn nvmlVgpuInstanceGetRuntimeStateSize( + vgpuInstance: cuda_types::nvml::nvmlVgpuInstance_t, + pState: *mut cuda_types::nvml::nvmlVgpuRuntimeState_t, + ) -> cuda_types::nvml::nvmlReturn_t; + #[must_use] /** Set the desirable vGPU capability of a device Refer to the \a nvmlDeviceVgpuCapability_t structure for the specific capabilities that can be set. @@ -5875,6 +6416,23 @@ extern "system" { vgpuInstanceCountPerVm: *mut ::core::ffi::c_uint, ) -> cuda_types::nvml::nvmlReturn_t; #[must_use] + /** Retrieve the BAR1 info for given vGPU type. + + For Maxwell &tm; or newer fully supported devices. 
+ + @param vgpuTypeId Handle to vGPU type + @param bar1Info Pointer to the vGPU type BAR1 information structure \a nvmlVgpuTypeBar1Info_t + + @return + - \ref NVML_SUCCESS successful completion + - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a bar1Info is NULL + - \ref NVML_ERROR_UNKNOWN on any unexpected error*/ + fn nvmlVgpuTypeGetBAR1Info( + vgpuTypeId: cuda_types::nvml::nvmlVgpuTypeId_t, + bar1Info: *mut cuda_types::nvml::nvmlVgpuTypeBar1Info_t, + ) -> cuda_types::nvml::nvmlReturn_t; + #[must_use] /** Retrieve the active vGPU instances on a device. An array of active vGPU instances is returned in the caller-supplied buffer pointed at by \a vgpuInstances. The @@ -6642,17 +7200,17 @@ returned in \a sessionCount @param vgpuUtilInfo Pointer to the caller-provided structure of nvmlVgpuInstancesUtilizationInfo_t @return - - \ref NVML_SUCCESS if utilization samples are successfully retrieved - - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a vgpuUtilInfo is NULL, or \a vgpuUtilInfo->vgpuInstanceCount is 0 - - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device - - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - - \ref NVML_ERROR_VERSION_MISMATCH if the version of \a vgpuUtilInfo is invalid - - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a vgpuUtilInfo->vgpuUtilArray is NULL, or the buffer size of vgpuUtilInfo->vgpuInstanceCount is too small. - The caller should check the current vGPU instance count from the returned vgpuUtilInfo->vgpuInstanceCount, and call - the function again with a buffer of size vgpuUtilInfo->vgpuInstanceCount * sizeof(nvmlVgpuInstanceUtilizationInfo_t) - - \ref NVML_ERROR_NOT_FOUND if sample entries are not found - - \ref NVML_ERROR_UNKNOWN on any unexpected error*/ + - \ref NVML_SUCCESS If utilization samples are successfully retrieved + - \ref NVML_ERROR_UNINITIALIZED If the library has not been successfully initialized + - \ref NVML_ERROR_INVALID_ARGUMENT If \a device is invalid, \a vgpuUtilInfo is NULL, or \a vgpuUtilInfo->vgpuInstanceCount is 0 + - \ref NVML_ERROR_NOT_SUPPORTED If vGPU is not supported by the device + - \ref NVML_ERROR_GPU_IS_LOST If the target GPU has fallen off the bus or is otherwise inaccessible + - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH If the version of \a vgpuUtilInfo is invalid + - \ref NVML_ERROR_INSUFFICIENT_SIZE If \a vgpuUtilInfo->vgpuUtilArray is NULL, or the buffer size of vgpuUtilInfo->vgpuInstanceCount is too small. 
+ The caller should check the current vGPU instance count from the returned vgpuUtilInfo->vgpuInstanceCount, and call + the function again with a buffer of size vgpuUtilInfo->vgpuInstanceCount * sizeof(nvmlVgpuInstanceUtilizationInfo_t) + - \ref NVML_ERROR_NOT_FOUND If sample entries are not found + - \ref NVML_ERROR_UNKNOWN On any unexpected error*/ fn nvmlDeviceGetVgpuInstancesUtilizationInfo( device: cuda_types::nvml::nvmlDevice_t, vgpuUtilInfo: *mut cuda_types::nvml::nvmlVgpuInstancesUtilizationInfo_t, @@ -6734,19 +7292,19 @@ returned in \a sessionCount @param vgpuProcUtilInfo Pointer to the caller-provided structure of nvmlVgpuProcessesUtilizationInfo_t @return - - \ref NVML_SUCCESS if utilization samples are successfully retrieved - - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a vgpuProcUtilInfo is null - - \ref NVML_ERROR_VERSION_MISMATCH if the version of \a vgpuProcUtilInfo is invalid - - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a vgpuProcUtilInfo->vgpuProcUtilArray is null, or supplied \a vgpuProcUtilInfo->vgpuProcessCount - is too small to return samples for all processes on vGPU instances currently executing on the device. - The caller should check the current processes count from the returned \a vgpuProcUtilInfo->vgpuProcessCount, - and call the function again with a buffer of size - vgpuProcUtilInfo->vgpuProcessCount * sizeof(nvmlVgpuProcessUtilizationSample_t) - - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device - - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - - \ref NVML_ERROR_NOT_FOUND if sample entries are not found - - \ref NVML_ERROR_UNKNOWN on any unexpected error*/ + - \ref NVML_SUCCESS If utilization samples are successfully retrieved + - \ref NVML_ERROR_UNINITIALIZED If the library has not been successfully initialized + - \ref NVML_ERROR_INVALID_ARGUMENT If \a device is invalid, or \a vgpuProcUtilInfo is null + - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH If the version of \a vgpuProcUtilInfo is invalid + - \ref NVML_ERROR_INSUFFICIENT_SIZE If \a vgpuProcUtilInfo->vgpuProcUtilArray is null, or supplied \a vgpuProcUtilInfo->vgpuProcessCount + is too small to return samples for all processes on vGPU instances currently executing on the device. + The caller should check the current processes count from the returned \a vgpuProcUtilInfo->vgpuProcessCount, + and call the function again with a buffer of size + vgpuProcUtilInfo->vgpuProcessCount * sizeof(nvmlVgpuProcessUtilizationSample_t) + - \ref NVML_ERROR_NOT_SUPPORTED If vGPU is not supported by the device + - \ref NVML_ERROR_GPU_IS_LOST If the target GPU has fallen off the bus or is otherwise inaccessible + - \ref NVML_ERROR_NOT_FOUND If sample entries are not found + - \ref NVML_ERROR_UNKNOWN On any unexpected error*/ fn nvmlDeviceGetVgpuProcessesUtilizationInfo( device: cuda_types::nvml::nvmlDevice_t, vgpuProcUtilInfo: *mut cuda_types::nvml::nvmlVgpuProcessesUtilizationInfo_t, @@ -7717,7 +8275,7 @@ returned in \a sessionCount #[must_use] /** Get GPM stream state. - %HOPPER_OR_NEWER% + For Hopper &tm; or newer fully supported devices. Supported on Linux, Windows TCC. @param device The identifier of the target device @@ -7736,7 +8294,7 @@ returned in \a sessionCount #[must_use] /** Set GPM stream state. - %HOPPER_OR_NEWER% + For Hopper &tm; or newer fully supported devices. Supported on Linux, Windows TCC. 
 @param device The identifier of the target device
@@ -7753,105 +8311,191 @@ returned in \a sessionCount
 state: ::core::ffi::c_uint,
 ) -> cuda_types::nvml::nvmlReturn_t;
 #[must_use]
- /** Set NvLink Low Power Threshold for device.
+ /** Get device capabilities
- %HOPPER_OR_NEWER%
+ See \ref nvmlDeviceCapabilities_v1_t for more information on the struct.
 @param device The identifier of the target device
- @param info Reference to \a nvmlNvLinkPowerThres_t struct
- input parameters
+ @param caps Returns GPU's capabilities
 @return
- - \ref NVML_SUCCESS if the \a Threshold is successfully set
- - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a Threshold is not within range
- - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device
+ - \ref NVML_SUCCESS If the query is success
+ - \ref NVML_ERROR_UNINITIALIZED If the library has not been successfully initialized
+ - \ref NVML_ERROR_INVALID_ARGUMENT If \a device is invalid or \a counters is NULL
+ - \ref NVML_ERROR_NOT_SUPPORTED If the device does not support this feature
+ - \ref NVML_ERROR_GPU_IS_LOST If the target GPU has fallen off the bus or is otherwise inaccessible
+ - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH If the provided version is invalid/unsupported
+ - \ref NVML_ERROR_UNKNOWN On any unexpected error*/
+ fn nvmlDeviceGetCapabilities(
+ device: cuda_types::nvml::nvmlDevice_t,
+ caps: *mut cuda_types::nvml::nvmlDeviceCapabilities_t,
+ ) -> cuda_types::nvml::nvmlReturn_t;
+ #[must_use]
+ /** Get Performance Profiles Information
+
+ %BLACKWELL_OR_NEWER%
+ See \ref nvmlWorkloadPowerProfileProfilesInfo_v1_t for more information on the struct.
+ The mask \a perfProfilesMask is a bitmask of all supported mode indices where the
+ mode is supported if the index is 1. Each supported mode will have a corresponding
+ entry in the \a perfProfile array which will contain the \a profileId, the
+ \a priority of this mode, where the lower the value, the higher the priority,
+ and a \a conflictingMask, where each bit set in the mask corresponds to a different
+ profile which cannot be used in conjunction with the given profile.
+
+ @param device The identifier of the target device
+ @param profilesInfo Reference to struct \a nvmlWorkloadPowerProfileProfilesInfo_t
+
+ @return
+ - \ref NVML_SUCCESS If the query is successful
+ - \ref NVML_ERROR_INSUFFICIENT_SIZE If struct is fully allocated
+ - \ref NVML_ERROR_UNINITIALIZED If the library has not been successfully initialized
+ - \ref NVML_ERROR_INVALID_ARGUMENT If \a device is invalid or \a pointer to struct is NULL
+ - \ref NVML_ERROR_NOT_SUPPORTED If the device does not support this feature
+ - \ref NVML_ERROR_GPU_IS_LOST If the target GPU has fallen off the bus or is otherwise inaccessible
+ - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH If the provided version is invalid/unsupported
+ - \ref NVML_ERROR_UNKNOWN On any unexpected error*/
+ fn nvmlDeviceWorkloadPowerProfileGetProfilesInfo(
+ device: cuda_types::nvml::nvmlDevice_t,
+ profilesInfo: *mut cuda_types::nvml::nvmlWorkloadPowerProfileProfilesInfo_t,
+ ) -> cuda_types::nvml::nvmlReturn_t;
+ #[must_use]
+ /** Get Current Performance Profiles
+
+ %BLACKWELL_OR_NEWER%
+ See \ref nvmlWorkloadPowerProfileCurrentProfiles_v1_t for more information on the struct.
+ This API returns a struct which contains the current \a perfProfilesMask,
+ \a requestedProfilesMask and \a enforcedProfilesMask. Each bit set in each
+ bitmask indicates the profile is supported, currently requested or currently
+ engaged, respectively.
+
+ @param device The identifier of the target device
+ @param currentProfiles Reference to struct \a nvmlWorkloadPowerProfileCurrentProfiles_v1_t
+
+ @return
+ - \ref NVML_SUCCESS If the query is successful
+ - \ref NVML_ERROR_UNINITIALIZED If the library has not been successfully initialized
+ - \ref NVML_ERROR_INVALID_ARGUMENT If \a device is invalid or the pointer to struct is NULL
+ - \ref NVML_ERROR_NOT_SUPPORTED If the device does not support this feature
+ - \ref NVML_ERROR_GPU_IS_LOST If the target GPU has fallen off the bus or is otherwise inaccessible
+ - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH If the provided version is invalid/unsupported
+ - \ref NVML_ERROR_UNKNOWN On any unexpected error*/
+ fn nvmlDeviceWorkloadPowerProfileGetCurrentProfiles(
+ device: cuda_types::nvml::nvmlDevice_t,
+ currentProfiles: *mut cuda_types::nvml::nvmlWorkloadPowerProfileCurrentProfiles_t,
+ ) -> cuda_types::nvml::nvmlReturn_t;
+ #[must_use]
+ /** Set Requested Performance Profiles
+
+ %BLACKWELL_OR_NEWER%
+ See \ref nvmlWorkloadPowerProfileRequestedProfiles_v1_t for more information on the struct.
+ Request one or more performance profiles be activated using the input bitmask
+ \a requestedProfilesMask, where each bit set corresponds to a supported bit from
+ the \a perfProfilesMask. These profiles will be added to the existing list of
+ currently requested profiles.
+
+ @param device The identifier of the target device
+ @param requestedProfiles Reference to struct \a nvmlWorkloadPowerProfileRequestedProfiles_v1_t
+
+ @return
+ - \ref NVML_SUCCESS If the query is successful
+ - \ref NVML_ERROR_UNINITIALIZED If the library has not been successfully initialized
+ - \ref NVML_ERROR_INVALID_ARGUMENT If \a device is invalid or \a pointer to struct is NULL
+ - \ref NVML_ERROR_NOT_SUPPORTED If the device does not support this feature
+ - \ref NVML_ERROR_GPU_IS_LOST If the target GPU has fallen off the bus or is otherwise inaccessible
+ - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH If the provided version is invalid/unsupported
+ - \ref NVML_ERROR_UNKNOWN On any unexpected error*/
+ fn nvmlDeviceWorkloadPowerProfileSetRequestedProfiles(
+ device: cuda_types::nvml::nvmlDevice_t,
+ requestedProfiles: *mut cuda_types::nvml::nvmlWorkloadPowerProfileRequestedProfiles_t,
+ ) -> cuda_types::nvml::nvmlReturn_t;
+ #[must_use]
+ /** Clear Requested Performance Profiles
+
+ %BLACKWELL_OR_NEWER%
+ See \ref nvmlWorkloadPowerProfileRequestedProfiles_v1_t for more information on the struct.
+ Clear one or more performance profiles by using the input bitmask
+ \a requestedProfilesMask, where each bit set corresponds to a supported bit from
+ the \a perfProfilesMask. These profiles will be removed from the existing list of
+ currently requested profiles.
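// [Editor's aside - illustrative sketch, not part of the generated bindings or this patch.]
// The mask conventions described above, reduced to a single u64 for clarity; the real
// structs carry versioned mask fields, so treat these helpers as a model of the
// semantics (bit index = profile id) rather than of the actual layout.
fn profile_supported(perf_profiles_mask: u64, profile_id: u32) -> bool {
    // A profile is supported when its index bit is set in perfProfilesMask.
    perf_profiles_mask & (1u64 << profile_id) != 0
}
fn profiles_conflict(conflicting_mask: u64, other_profile_id: u32) -> bool {
    // conflictingMask marks every profile that cannot be engaged alongside this one.
    conflicting_mask & (1u64 << other_profile_id) != 0
}
fn request_profile(requested_mask: u64, profile_id: u32) -> u64 {
    // Set* adds to the requested set; Clear* would mask the same bit off instead.
    requested_mask | (1u64 << profile_id)
}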
+
+ @param device The identifier of the target device
+ @param requestedProfiles Reference to struct \a nvmlWorkloadPowerProfileRequestedProfiles_v1_t
+
+ @return
+ - \ref NVML_SUCCESS If the query is successful
+ - \ref NVML_ERROR_UNINITIALIZED If the library has not been successfully initialized
+ - \ref NVML_ERROR_INVALID_ARGUMENT If \a device is invalid or \a pointer to struct is NULL
+ - \ref NVML_ERROR_NOT_SUPPORTED If the device does not support this feature
+ - \ref NVML_ERROR_GPU_IS_LOST If the target GPU has fallen off the bus or is otherwise inaccessible
+ - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH If the provided version is invalid/unsupported
+ - \ref NVML_ERROR_UNKNOWN On any unexpected error*/
+ fn nvmlDeviceWorkloadPowerProfileClearRequestedProfiles(
+ device: cuda_types::nvml::nvmlDevice_t,
+ requestedProfiles: *mut cuda_types::nvml::nvmlWorkloadPowerProfileRequestedProfiles_t,
+ ) -> cuda_types::nvml::nvmlReturn_t;
+ #[must_use]
+ /** Activate a specific preset profile for datacenter power smoothing
+ The API only sets the active preset profile based on the input profileId,
+ and ignores the other parameters of the structure.
+
+ %BLACKWELL_OR_NEWER%
+
+ @param device The identifier of the target device
+ @param profile Reference to \ref nvmlPowerSmoothingProfile_t.
+ Note that only \a profile->profileId is used and
+ the rest of the structure is ignored.
+
+ @return
+ - \ref NVML_SUCCESS if the Desired Profile was successfully set
+ - \ref NVML_ERROR_INVALID_ARGUMENT if device is invalid or structure was NULL
+ - \ref NVML_ERROR_NO_PERMISSION if user does not have permission to change the profile number
+ - \ref NVML_ERROR_NOT_SUPPORTED if this feature is not supported by the device
+*/
- fn nvmlDeviceSetNvLinkDeviceLowPowerThreshold(
+ fn nvmlDevicePowerSmoothingActivatePresetProfile(
 device: cuda_types::nvml::nvmlDevice_t,
- info: *mut cuda_types::nvml::nvmlNvLinkPowerThres_t,
+ profile: *mut cuda_types::nvml::nvmlPowerSmoothingProfile_t,
 ) -> cuda_types::nvml::nvmlReturn_t;
 #[must_use]
- /** Set the global nvlink bandwith mode
+ /** Update the value of a specific profile parameter contained within \ref nvmlPowerSmoothingProfile_t
- @param nvlinkBwMode nvlink bandwidth mode
- @return
- - \ref NVML_SUCCESS on success
- - \ref NVML_ERROR_INVALID_ARGUMENT if an invalid argument is provided
- - \ref NVML_ERROR_IN_USE if P2P object exists
- - \ref NVML_ERROR_NOT_SUPPORTED if GPU is not Hopper or newer architecture.
- - \ref NVML_ERROR_NO_PERMISSION if not root user*/
- fn nvmlSystemSetNvlinkBwMode(
- nvlinkBwMode: ::core::ffi::c_uint,
- ) -> cuda_types::nvml::nvmlReturn_t;
- #[must_use]
- /** Get the global nvlink bandwith mode
+ %BLACKWELL_OR_NEWER%
- @param nvlinkBwMode reference of nvlink bandwidth mode
- @return
- - \ref NVML_SUCCESS on success
- - \ref NVML_ERROR_INVALID_ARGUMENT if an invalid pointer is provided
- - \ref NVML_ERROR_NOT_SUPPORTED if GPU is not Hopper or newer architecture.
- - \ref NVML_ERROR_NO_PERMISSION if not root user*/
- fn nvmlSystemGetNvlinkBwMode(
- nvlinkBwMode: *mut ::core::ffi::c_uint,
- ) -> cuda_types::nvml::nvmlReturn_t;
- #[must_use]
- /** Set new power limit of this device.
+ NVML_POWER_SMOOTHING_PROFILE_PARAM_PERCENT_TMP_FLOOR expects a value as a percentage from 00.00-100.00% + NVML_POWER_SMOOTHING_PROFILE_PARAM_RAMP_UP_RATE expects a value in W/s + NVML_POWER_SMOOTHING_PROFILE_PARAM_RAMP_DOWN_RATE expects a value in W/s + NVML_POWER_SMOOTHING_PROFILE_PARAM_RAMP_DOWN_HYSTERESIS expects a value in ms - For Kepler &tm; or newer fully supported devices. - Requires root/admin permissions. - - See \ref nvmlDeviceGetPowerManagementLimitConstraints to check the allowed ranges of values. - - See \ref nvmlPowerValue_v2_t for more information on the struct. - - \note Limit is not persistent across reboots or driver unloads. - Enable persistent mode to prevent driver from unloading when no application is using the device. - - This API replaces nvmlDeviceSetPowerManagementLimit. It can be used as a drop-in replacement for the older version. - - @param device The identifier of the target device - @param powerValue Power management limit in milliwatts to set + @param device The identifier of the target device + @param profile Reference to \ref nvmlPowerSmoothingProfile_t struct @return - - \ref NVML_SUCCESS if \a limit has been set - - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a powerValue is NULL or contains invalid values - - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - - \ref NVML_ERROR_UNKNOWN on any unexpected error - - @see NVML_FI_DEV_POWER_AVERAGE - @see NVML_FI_DEV_POWER_INSTANT - @see NVML_FI_DEV_POWER_MIN_LIMIT - @see NVML_FI_DEV_POWER_MAX_LIMIT - @see NVML_FI_DEV_POWER_CURRENT_LIMIT*/ - fn nvmlDeviceSetPowerManagementLimit_v2( + - \ref NVML_SUCCESS if the Active Profile was successfully set + - \ref NVML_ERROR_INVALID_ARGUMENT if device is invalid or profile parameter/value was invalid + - \ref NVML_ERROR_NO_PERMISSION if user does not have permission to change any profile parameters + - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH if the structure version is not supported +*/ + fn nvmlDevicePowerSmoothingUpdatePresetProfileParam( device: cuda_types::nvml::nvmlDevice_t, - powerValue: *mut cuda_types::nvml::nvmlPowerValue_v2_t, + profile: *mut cuda_types::nvml::nvmlPowerSmoothingProfile_t, ) -> cuda_types::nvml::nvmlReturn_t; #[must_use] - /** Get SRAM ECC error status of this device. + /** Enable or disable the Power Smoothing Feature - For Ampere &tm; or newer fully supported devices. - Requires root/admin permissions. + %BLACKWELL_OR_NEWER% - See \ref nvmlEccSramErrorStatus_v1_t for more information on the struct. 
+ See \ref nvmlEnableState_t for details on allowed states - @param device The identifier of the target device - @param status Returns SRAM ECC error status + @param device The identifier of the target device + @param state Reference to \ref nvmlPowerSmoothingState_t @return - - \ref NVML_SUCCESS if \a limit has been set - - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a counters is NULL - - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - - \ref NVML_ERROR_VERSION_MISMATCH if the version of \a nvmlEccSramErrorStatus_t is invalid - - \ref NVML_ERROR_UNKNOWN on any unexpected error*/ - fn nvmlDeviceGetSramEccErrorStatus( + - \ref NVML_SUCCESS if the feature state was successfully set + - \ref NVML_ERROR_INVALID_ARGUMENT if device is invalid or state is NULL + - \ref NVML_ERROR_NO_PERMISSION if user does not have permission to change feature state + - \ref NVML_ERROR_NOT_SUPPORTED if this feature is not supported by the device +*/ + fn nvmlDevicePowerSmoothingSetState( device: cuda_types::nvml::nvmlDevice_t, - status: *mut cuda_types::nvml::nvmlEccSramErrorStatus_t, + state: *mut cuda_types::nvml::nvmlPowerSmoothingState_t, ) -> cuda_types::nvml::nvmlReturn_t; } diff --git a/cuda_types/src/cublas.rs b/cuda_types/src/cublas.rs new file mode 100644 index 0000000..31adb7a --- /dev/null +++ b/cuda_types/src/cublas.rs @@ -0,0 +1,324 @@ +// Generated automatically by zluda_bindgen +// DO NOT EDIT MANUALLY +#![allow(warnings)] +pub type __half = u16; +pub type __nv_bfloat16 = u16; +pub use super::cuda::cuComplex; +pub use super::cuda::cuDoubleComplex; +pub use super::cuda::cudaDataType; +pub use super::cuda::cudaDataType_t; +pub type cudaStream_t = super::cuda::CUstream; +pub use super::cuda::libraryPropertyType; +pub type cudaGraphExecUpdateResultInfo_st = super::cuda::CUgraphExecUpdateResultInfo_st; +pub type cudaAsyncNotificationType = super::cuda::CUasyncNotificationType_enum; +pub type cudaGraph_t = super::cuda::CUgraph; +pub const CUBLAS_VER_MAJOR: u32 = 12; +pub const CUBLAS_VER_MINOR: u32 = 8; +pub const CUBLAS_VER_PATCH: u32 = 4; +pub const CUBLAS_VER_BUILD: u32 = 1; +pub const CUBLAS_VERSION: u32 = 120804; +impl cublasStatus_t { + pub const CUBLAS_STATUS_SUCCESS: cublasStatus_t = cublasStatus_t(0); +} +impl cublasStatus_t { + pub const CUBLAS_STATUS_NOT_INITIALIZED: cublasStatus_t = cublasStatus_t(1); +} +impl cublasStatus_t { + pub const CUBLAS_STATUS_ALLOC_FAILED: cublasStatus_t = cublasStatus_t(3); +} +impl cublasStatus_t { + pub const CUBLAS_STATUS_INVALID_VALUE: cublasStatus_t = cublasStatus_t(7); +} +impl cublasStatus_t { + pub const CUBLAS_STATUS_ARCH_MISMATCH: cublasStatus_t = cublasStatus_t(8); +} +impl cublasStatus_t { + pub const CUBLAS_STATUS_MAPPING_ERROR: cublasStatus_t = cublasStatus_t(11); +} +impl cublasStatus_t { + pub const CUBLAS_STATUS_EXECUTION_FAILED: cublasStatus_t = cublasStatus_t(13); +} +impl cublasStatus_t { + pub const CUBLAS_STATUS_INTERNAL_ERROR: cublasStatus_t = cublasStatus_t(14); +} +impl cublasStatus_t { + pub const CUBLAS_STATUS_NOT_SUPPORTED: cublasStatus_t = cublasStatus_t(15); +} +impl cublasStatus_t { + pub const CUBLAS_STATUS_LICENSE_ERROR: cublasStatus_t = cublasStatus_t(16); +} +#[repr(transparent)] +#[must_use] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cublasStatus_t(pub ::core::ffi::c_uint); 
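// [Editor's aside - illustrative sketch, not part of the generated bindings or this patch.]
// zluda_bindgen models C enums as #[repr(transparent)] newtypes with associated
// constants rather than Rust enums, so raw values introduced by future cuBLAS
// releases stay representable without undefined behavior. The derived PartialEq/Eq
// still allow the constants to be used in patterns:
fn is_retryable(status: cublasStatus_t) -> bool {
    match status {
        cublasStatus_t::CUBLAS_STATUS_ALLOC_FAILED => true,
        cublasStatus_t(raw) => {
            // Any other value, including ones unknown to these bindings.
            let _ = raw;
            false
        }
    }
}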
+impl cublasFillMode_t { + pub const CUBLAS_FILL_MODE_LOWER: cublasFillMode_t = cublasFillMode_t(0); +} +impl cublasFillMode_t { + pub const CUBLAS_FILL_MODE_UPPER: cublasFillMode_t = cublasFillMode_t(1); +} +impl cublasFillMode_t { + pub const CUBLAS_FILL_MODE_FULL: cublasFillMode_t = cublasFillMode_t(2); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cublasFillMode_t(pub ::core::ffi::c_uint); +impl cublasDiagType_t { + pub const CUBLAS_DIAG_NON_UNIT: cublasDiagType_t = cublasDiagType_t(0); +} +impl cublasDiagType_t { + pub const CUBLAS_DIAG_UNIT: cublasDiagType_t = cublasDiagType_t(1); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cublasDiagType_t(pub ::core::ffi::c_uint); +impl cublasSideMode_t { + pub const CUBLAS_SIDE_LEFT: cublasSideMode_t = cublasSideMode_t(0); +} +impl cublasSideMode_t { + pub const CUBLAS_SIDE_RIGHT: cublasSideMode_t = cublasSideMode_t(1); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cublasSideMode_t(pub ::core::ffi::c_uint); +impl cublasOperation_t { + pub const CUBLAS_OP_N: cublasOperation_t = cublasOperation_t(0); +} +impl cublasOperation_t { + pub const CUBLAS_OP_T: cublasOperation_t = cublasOperation_t(1); +} +impl cublasOperation_t { + pub const CUBLAS_OP_C: cublasOperation_t = cublasOperation_t(2); +} +impl cublasOperation_t { + pub const CUBLAS_OP_HERMITAN: cublasOperation_t = cublasOperation_t(2); +} +impl cublasOperation_t { + pub const CUBLAS_OP_CONJG: cublasOperation_t = cublasOperation_t(3); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cublasOperation_t(pub ::core::ffi::c_uint); +impl cublasPointerMode_t { + pub const CUBLAS_POINTER_MODE_HOST: cublasPointerMode_t = cublasPointerMode_t(0); +} +impl cublasPointerMode_t { + pub const CUBLAS_POINTER_MODE_DEVICE: cublasPointerMode_t = cublasPointerMode_t(1); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cublasPointerMode_t(pub ::core::ffi::c_uint); +impl cublasAtomicsMode_t { + pub const CUBLAS_ATOMICS_NOT_ALLOWED: cublasAtomicsMode_t = cublasAtomicsMode_t(0); +} +impl cublasAtomicsMode_t { + pub const CUBLAS_ATOMICS_ALLOWED: cublasAtomicsMode_t = cublasAtomicsMode_t(1); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cublasAtomicsMode_t(pub ::core::ffi::c_uint); +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_DFALT: cublasGemmAlgo_t = cublasGemmAlgo_t(-1); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_DEFAULT: cublasGemmAlgo_t = cublasGemmAlgo_t(-1); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO0: cublasGemmAlgo_t = cublasGemmAlgo_t(0); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO1: cublasGemmAlgo_t = cublasGemmAlgo_t(1); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO2: cublasGemmAlgo_t = cublasGemmAlgo_t(2); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO3: cublasGemmAlgo_t = cublasGemmAlgo_t(3); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO4: cublasGemmAlgo_t = cublasGemmAlgo_t(4); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO5: cublasGemmAlgo_t = cublasGemmAlgo_t(5); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO6: cublasGemmAlgo_t = cublasGemmAlgo_t(6); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO7: cublasGemmAlgo_t = cublasGemmAlgo_t(7); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO8: cublasGemmAlgo_t = cublasGemmAlgo_t(8); +} +impl 
cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO9: cublasGemmAlgo_t = cublasGemmAlgo_t(9); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO10: cublasGemmAlgo_t = cublasGemmAlgo_t(10); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO11: cublasGemmAlgo_t = cublasGemmAlgo_t(11); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO12: cublasGemmAlgo_t = cublasGemmAlgo_t(12); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO13: cublasGemmAlgo_t = cublasGemmAlgo_t(13); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO14: cublasGemmAlgo_t = cublasGemmAlgo_t(14); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO15: cublasGemmAlgo_t = cublasGemmAlgo_t(15); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO16: cublasGemmAlgo_t = cublasGemmAlgo_t(16); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO17: cublasGemmAlgo_t = cublasGemmAlgo_t(17); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO18: cublasGemmAlgo_t = cublasGemmAlgo_t(18); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO19: cublasGemmAlgo_t = cublasGemmAlgo_t(19); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO20: cublasGemmAlgo_t = cublasGemmAlgo_t(20); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO21: cublasGemmAlgo_t = cublasGemmAlgo_t(21); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO22: cublasGemmAlgo_t = cublasGemmAlgo_t(22); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO23: cublasGemmAlgo_t = cublasGemmAlgo_t(23); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_DEFAULT_TENSOR_OP: cublasGemmAlgo_t = cublasGemmAlgo_t(99); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_DFALT_TENSOR_OP: cublasGemmAlgo_t = cublasGemmAlgo_t(99); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO0_TENSOR_OP: cublasGemmAlgo_t = cublasGemmAlgo_t(100); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO1_TENSOR_OP: cublasGemmAlgo_t = cublasGemmAlgo_t(101); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO2_TENSOR_OP: cublasGemmAlgo_t = cublasGemmAlgo_t(102); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO3_TENSOR_OP: cublasGemmAlgo_t = cublasGemmAlgo_t(103); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO4_TENSOR_OP: cublasGemmAlgo_t = cublasGemmAlgo_t(104); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO5_TENSOR_OP: cublasGemmAlgo_t = cublasGemmAlgo_t(105); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO6_TENSOR_OP: cublasGemmAlgo_t = cublasGemmAlgo_t(106); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO7_TENSOR_OP: cublasGemmAlgo_t = cublasGemmAlgo_t(107); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO8_TENSOR_OP: cublasGemmAlgo_t = cublasGemmAlgo_t(108); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO9_TENSOR_OP: cublasGemmAlgo_t = cublasGemmAlgo_t(109); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO10_TENSOR_OP: cublasGemmAlgo_t = cublasGemmAlgo_t(110); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO11_TENSOR_OP: cublasGemmAlgo_t = cublasGemmAlgo_t(111); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO12_TENSOR_OP: cublasGemmAlgo_t = cublasGemmAlgo_t(112); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO13_TENSOR_OP: cublasGemmAlgo_t = cublasGemmAlgo_t(113); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO14_TENSOR_OP: cublasGemmAlgo_t = cublasGemmAlgo_t(114); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO15_TENSOR_OP: cublasGemmAlgo_t = cublasGemmAlgo_t(115); +} +#[repr(transparent)] +#[derive(Debug, 
Copy, Clone, Hash, PartialEq, Eq)] +pub struct cublasGemmAlgo_t(pub ::core::ffi::c_int); +impl cublasMath_t { + pub const CUBLAS_DEFAULT_MATH: cublasMath_t = cublasMath_t(0); +} +impl cublasMath_t { + pub const CUBLAS_TENSOR_OP_MATH: cublasMath_t = cublasMath_t(1); +} +impl cublasMath_t { + pub const CUBLAS_PEDANTIC_MATH: cublasMath_t = cublasMath_t(2); +} +impl cublasMath_t { + pub const CUBLAS_TF32_TENSOR_OP_MATH: cublasMath_t = cublasMath_t(3); +} +impl cublasMath_t { + pub const CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION: cublasMath_t = cublasMath_t( + 16, + ); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cublasMath_t(pub ::core::ffi::c_uint); +pub use super::cuda::cudaDataType as cublasDataType_t; +impl cublasComputeType_t { + pub const CUBLAS_COMPUTE_16F: cublasComputeType_t = cublasComputeType_t(64); +} +impl cublasComputeType_t { + pub const CUBLAS_COMPUTE_16F_PEDANTIC: cublasComputeType_t = cublasComputeType_t(65); +} +impl cublasComputeType_t { + pub const CUBLAS_COMPUTE_32F: cublasComputeType_t = cublasComputeType_t(68); +} +impl cublasComputeType_t { + pub const CUBLAS_COMPUTE_32F_PEDANTIC: cublasComputeType_t = cublasComputeType_t(69); +} +impl cublasComputeType_t { + pub const CUBLAS_COMPUTE_32F_FAST_16F: cublasComputeType_t = cublasComputeType_t(74); +} +impl cublasComputeType_t { + pub const CUBLAS_COMPUTE_32F_FAST_16BF: cublasComputeType_t = cublasComputeType_t( + 75, + ); +} +impl cublasComputeType_t { + pub const CUBLAS_COMPUTE_32F_FAST_TF32: cublasComputeType_t = cublasComputeType_t( + 77, + ); +} +impl cublasComputeType_t { + pub const CUBLAS_COMPUTE_64F: cublasComputeType_t = cublasComputeType_t(70); +} +impl cublasComputeType_t { + pub const CUBLAS_COMPUTE_64F_PEDANTIC: cublasComputeType_t = cublasComputeType_t(71); +} +impl cublasComputeType_t { + pub const CUBLAS_COMPUTE_32I: cublasComputeType_t = cublasComputeType_t(72); +} +impl cublasComputeType_t { + pub const CUBLAS_COMPUTE_32I_PEDANTIC: cublasComputeType_t = cublasComputeType_t(73); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cublasComputeType_t(pub ::core::ffi::c_uint); +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct cublasContext { + _unused: [u8; 0], +} +pub type cublasHandle_t = *mut cublasContext; +pub type cublasLogCallback = ::core::option::Option< + unsafe extern "C" fn(msg: *const ::core::ffi::c_char), +>; diff --git a/cuda_types/src/cublaslt.rs b/cuda_types/src/cublaslt.rs new file mode 100644 index 0000000..90df2f5 --- /dev/null +++ b/cuda_types/src/cublaslt.rs @@ -0,0 +1,5387 @@ +// Generated automatically by zluda_bindgen +// DO NOT EDIT MANUALLY +#![allow(warnings)] +pub type __half = u16; +pub type __nv_bfloat16 = u16; +pub use super::cuda::cuComplex; +pub use super::cuda::cuDoubleComplex; +pub use super::cuda::cudaDataType; +pub use super::cuda::cudaDataType_t; +pub type cudaStream_t = super::cuda::CUstream; +pub use super::cuda::libraryPropertyType; +pub type cudaGraphExecUpdateResultInfo_st = super::cuda::CUgraphExecUpdateResultInfo_st; +pub type cudaAsyncNotificationType = super::cuda::CUasyncNotificationType_enum; +pub type cudaGraph_t = super::cuda::CUgraph; +pub const CUBLASLT_NUMERICAL_IMPL_FLAGS_FMA: u32 = 1; +pub const CUBLASLT_NUMERICAL_IMPL_FLAGS_HMMA: u32 = 2; +pub const CUBLASLT_NUMERICAL_IMPL_FLAGS_IMMA: u32 = 4; +pub const CUBLASLT_NUMERICAL_IMPL_FLAGS_DMMA: u32 = 8; +pub const CUBLASLT_NUMERICAL_IMPL_FLAGS_TENSOR_OP_MASK: u32 = 254; +pub const 
CUBLASLT_NUMERICAL_IMPL_FLAGS_OP_TYPE_MASK: u32 = 255; +pub const CUBLASLT_NUMERICAL_IMPL_FLAGS_ACCUMULATOR_16F: u32 = 256; +pub const CUBLASLT_NUMERICAL_IMPL_FLAGS_ACCUMULATOR_32F: u32 = 512; +pub const CUBLASLT_NUMERICAL_IMPL_FLAGS_ACCUMULATOR_64F: u32 = 1024; +pub const CUBLASLT_NUMERICAL_IMPL_FLAGS_ACCUMULATOR_32I: u32 = 2048; +pub const CUBLASLT_NUMERICAL_IMPL_FLAGS_ACCUMULATOR_TYPE_MASK: u32 = 65280; +pub const CUBLASLT_NUMERICAL_IMPL_FLAGS_INPUT_16F: u32 = 65536; +pub const CUBLASLT_NUMERICAL_IMPL_FLAGS_INPUT_16BF: u32 = 131072; +pub const CUBLASLT_NUMERICAL_IMPL_FLAGS_INPUT_TF32: u32 = 262144; +pub const CUBLASLT_NUMERICAL_IMPL_FLAGS_INPUT_32F: u32 = 524288; +pub const CUBLASLT_NUMERICAL_IMPL_FLAGS_INPUT_64F: u32 = 1048576; +pub const CUBLASLT_NUMERICAL_IMPL_FLAGS_INPUT_8I: u32 = 2097152; +pub const CUBLASLT_NUMERICAL_IMPL_FLAGS_INPUT_8F_E4M3: u32 = 4194304; +pub const CUBLASLT_NUMERICAL_IMPL_FLAGS_INPUT_8F_E5M2: u32 = 8388608; +pub const CUBLASLT_NUMERICAL_IMPL_FLAGS_OP_INPUT_TYPE_MASK: u32 = 16711680; +pub const CUBLASLT_NUMERICAL_IMPL_FLAGS_GAUSSIAN: u64 = 4294967296; +impl cublasStatus_t { + pub const CUBLAS_STATUS_SUCCESS: cublasStatus_t = cublasStatus_t(0); +} +impl cublasStatus_t { + pub const CUBLAS_STATUS_NOT_INITIALIZED: cublasStatus_t = cublasStatus_t(1); +} +impl cublasStatus_t { + pub const CUBLAS_STATUS_ALLOC_FAILED: cublasStatus_t = cublasStatus_t(3); +} +impl cublasStatus_t { + pub const CUBLAS_STATUS_INVALID_VALUE: cublasStatus_t = cublasStatus_t(7); +} +impl cublasStatus_t { + pub const CUBLAS_STATUS_ARCH_MISMATCH: cublasStatus_t = cublasStatus_t(8); +} +impl cublasStatus_t { + pub const CUBLAS_STATUS_MAPPING_ERROR: cublasStatus_t = cublasStatus_t(11); +} +impl cublasStatus_t { + pub const CUBLAS_STATUS_EXECUTION_FAILED: cublasStatus_t = cublasStatus_t(13); +} +impl cublasStatus_t { + pub const CUBLAS_STATUS_INTERNAL_ERROR: cublasStatus_t = cublasStatus_t(14); +} +impl cublasStatus_t { + pub const CUBLAS_STATUS_NOT_SUPPORTED: cublasStatus_t = cublasStatus_t(15); +} +impl cublasStatus_t { + pub const CUBLAS_STATUS_LICENSE_ERROR: cublasStatus_t = cublasStatus_t(16); +} +#[repr(transparent)] +#[must_use] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cublasStatus_t(pub ::core::ffi::c_uint); +impl cublasFillMode_t { + pub const CUBLAS_FILL_MODE_LOWER: cublasFillMode_t = cublasFillMode_t(0); +} +impl cublasFillMode_t { + pub const CUBLAS_FILL_MODE_UPPER: cublasFillMode_t = cublasFillMode_t(1); +} +impl cublasFillMode_t { + pub const CUBLAS_FILL_MODE_FULL: cublasFillMode_t = cublasFillMode_t(2); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cublasFillMode_t(pub ::core::ffi::c_uint); +impl cublasDiagType_t { + pub const CUBLAS_DIAG_NON_UNIT: cublasDiagType_t = cublasDiagType_t(0); +} +impl cublasDiagType_t { + pub const CUBLAS_DIAG_UNIT: cublasDiagType_t = cublasDiagType_t(1); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cublasDiagType_t(pub ::core::ffi::c_uint); +impl cublasSideMode_t { + pub const CUBLAS_SIDE_LEFT: cublasSideMode_t = cublasSideMode_t(0); +} +impl cublasSideMode_t { + pub const CUBLAS_SIDE_RIGHT: cublasSideMode_t = cublasSideMode_t(1); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cublasSideMode_t(pub ::core::ffi::c_uint); +impl cublasOperation_t { + pub const CUBLAS_OP_N: cublasOperation_t = cublasOperation_t(0); +} +impl cublasOperation_t { + pub const CUBLAS_OP_T: cublasOperation_t = cublasOperation_t(1); 
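+ // NOTE (editorial sketch, illustrative only -- not part of the zluda_bindgen output): these bindings model each C enum as a #[repr(transparent)] newtype over the underlying C integer, with one associated const per enumerator, rather than as a Rust enum; that way any value the driver hands back outside the listed constants stays representable without undefined behavior. A minimal, hypothetical usage sketch: + // let op = cublasOperation_t::CUBLAS_OP_N; + // assert_eq!(op.0, 0); // the wrapped ::core::ffi::c_uint is public, so FFI calls pass it by value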
+} +impl cublasOperation_t { + pub const CUBLAS_OP_C: cublasOperation_t = cublasOperation_t(2); +} +impl cublasOperation_t { + pub const CUBLAS_OP_HERMITAN: cublasOperation_t = cublasOperation_t(2); +} +impl cublasOperation_t { + pub const CUBLAS_OP_CONJG: cublasOperation_t = cublasOperation_t(3); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cublasOperation_t(pub ::core::ffi::c_uint); +impl cublasPointerMode_t { + pub const CUBLAS_POINTER_MODE_HOST: cublasPointerMode_t = cublasPointerMode_t(0); +} +impl cublasPointerMode_t { + pub const CUBLAS_POINTER_MODE_DEVICE: cublasPointerMode_t = cublasPointerMode_t(1); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cublasPointerMode_t(pub ::core::ffi::c_uint); +impl cublasAtomicsMode_t { + pub const CUBLAS_ATOMICS_NOT_ALLOWED: cublasAtomicsMode_t = cublasAtomicsMode_t(0); +} +impl cublasAtomicsMode_t { + pub const CUBLAS_ATOMICS_ALLOWED: cublasAtomicsMode_t = cublasAtomicsMode_t(1); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cublasAtomicsMode_t(pub ::core::ffi::c_uint); +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_DFALT: cublasGemmAlgo_t = cublasGemmAlgo_t(-1); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_DEFAULT: cublasGemmAlgo_t = cublasGemmAlgo_t(-1); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO0: cublasGemmAlgo_t = cublasGemmAlgo_t(0); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO1: cublasGemmAlgo_t = cublasGemmAlgo_t(1); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO2: cublasGemmAlgo_t = cublasGemmAlgo_t(2); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO3: cublasGemmAlgo_t = cublasGemmAlgo_t(3); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO4: cublasGemmAlgo_t = cublasGemmAlgo_t(4); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO5: cublasGemmAlgo_t = cublasGemmAlgo_t(5); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO6: cublasGemmAlgo_t = cublasGemmAlgo_t(6); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO7: cublasGemmAlgo_t = cublasGemmAlgo_t(7); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO8: cublasGemmAlgo_t = cublasGemmAlgo_t(8); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO9: cublasGemmAlgo_t = cublasGemmAlgo_t(9); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO10: cublasGemmAlgo_t = cublasGemmAlgo_t(10); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO11: cublasGemmAlgo_t = cublasGemmAlgo_t(11); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO12: cublasGemmAlgo_t = cublasGemmAlgo_t(12); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO13: cublasGemmAlgo_t = cublasGemmAlgo_t(13); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO14: cublasGemmAlgo_t = cublasGemmAlgo_t(14); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO15: cublasGemmAlgo_t = cublasGemmAlgo_t(15); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO16: cublasGemmAlgo_t = cublasGemmAlgo_t(16); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO17: cublasGemmAlgo_t = cublasGemmAlgo_t(17); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO18: cublasGemmAlgo_t = cublasGemmAlgo_t(18); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO19: cublasGemmAlgo_t = cublasGemmAlgo_t(19); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO20: cublasGemmAlgo_t = cublasGemmAlgo_t(20); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO21: cublasGemmAlgo_t = 
cublasGemmAlgo_t(21); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO22: cublasGemmAlgo_t = cublasGemmAlgo_t(22); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO23: cublasGemmAlgo_t = cublasGemmAlgo_t(23); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_DEFAULT_TENSOR_OP: cublasGemmAlgo_t = cublasGemmAlgo_t(99); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_DFALT_TENSOR_OP: cublasGemmAlgo_t = cublasGemmAlgo_t(99); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO0_TENSOR_OP: cublasGemmAlgo_t = cublasGemmAlgo_t(100); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO1_TENSOR_OP: cublasGemmAlgo_t = cublasGemmAlgo_t(101); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO2_TENSOR_OP: cublasGemmAlgo_t = cublasGemmAlgo_t(102); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO3_TENSOR_OP: cublasGemmAlgo_t = cublasGemmAlgo_t(103); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO4_TENSOR_OP: cublasGemmAlgo_t = cublasGemmAlgo_t(104); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO5_TENSOR_OP: cublasGemmAlgo_t = cublasGemmAlgo_t(105); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO6_TENSOR_OP: cublasGemmAlgo_t = cublasGemmAlgo_t(106); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO7_TENSOR_OP: cublasGemmAlgo_t = cublasGemmAlgo_t(107); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO8_TENSOR_OP: cublasGemmAlgo_t = cublasGemmAlgo_t(108); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO9_TENSOR_OP: cublasGemmAlgo_t = cublasGemmAlgo_t(109); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO10_TENSOR_OP: cublasGemmAlgo_t = cublasGemmAlgo_t(110); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO11_TENSOR_OP: cublasGemmAlgo_t = cublasGemmAlgo_t(111); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO12_TENSOR_OP: cublasGemmAlgo_t = cublasGemmAlgo_t(112); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO13_TENSOR_OP: cublasGemmAlgo_t = cublasGemmAlgo_t(113); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO14_TENSOR_OP: cublasGemmAlgo_t = cublasGemmAlgo_t(114); +} +impl cublasGemmAlgo_t { + pub const CUBLAS_GEMM_ALGO15_TENSOR_OP: cublasGemmAlgo_t = cublasGemmAlgo_t(115); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cublasGemmAlgo_t(pub ::core::ffi::c_int); +impl cublasMath_t { + pub const CUBLAS_DEFAULT_MATH: cublasMath_t = cublasMath_t(0); +} +impl cublasMath_t { + pub const CUBLAS_TENSOR_OP_MATH: cublasMath_t = cublasMath_t(1); +} +impl cublasMath_t { + pub const CUBLAS_PEDANTIC_MATH: cublasMath_t = cublasMath_t(2); +} +impl cublasMath_t { + pub const CUBLAS_TF32_TENSOR_OP_MATH: cublasMath_t = cublasMath_t(3); +} +impl cublasMath_t { + pub const CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION: cublasMath_t = cublasMath_t( + 16, + ); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cublasMath_t(pub ::core::ffi::c_uint); +pub use super::cuda::cudaDataType as cublasDataType_t; +impl cublasComputeType_t { + pub const CUBLAS_COMPUTE_16F: cublasComputeType_t = cublasComputeType_t(64); +} +impl cublasComputeType_t { + pub const CUBLAS_COMPUTE_16F_PEDANTIC: cublasComputeType_t = cublasComputeType_t(65); +} +impl cublasComputeType_t { + pub const CUBLAS_COMPUTE_32F: cublasComputeType_t = cublasComputeType_t(68); +} +impl cublasComputeType_t { + pub const CUBLAS_COMPUTE_32F_PEDANTIC: cublasComputeType_t = cublasComputeType_t(69); +} +impl cublasComputeType_t { + pub const 
CUBLAS_COMPUTE_32F_FAST_16F: cublasComputeType_t = cublasComputeType_t(74); +} +impl cublasComputeType_t { + pub const CUBLAS_COMPUTE_32F_FAST_16BF: cublasComputeType_t = cublasComputeType_t( + 75, + ); +} +impl cublasComputeType_t { + pub const CUBLAS_COMPUTE_32F_FAST_TF32: cublasComputeType_t = cublasComputeType_t( + 77, + ); +} +impl cublasComputeType_t { + pub const CUBLAS_COMPUTE_64F: cublasComputeType_t = cublasComputeType_t(70); +} +impl cublasComputeType_t { + pub const CUBLAS_COMPUTE_64F_PEDANTIC: cublasComputeType_t = cublasComputeType_t(71); +} +impl cublasComputeType_t { + pub const CUBLAS_COMPUTE_32I: cublasComputeType_t = cublasComputeType_t(72); +} +impl cublasComputeType_t { + pub const CUBLAS_COMPUTE_32I_PEDANTIC: cublasComputeType_t = cublasComputeType_t(73); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cublasComputeType_t(pub ::core::ffi::c_uint); +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct cublasContext { + _unused: [u8; 0], +} +pub type cublasHandle_t = *mut cublasContext; +pub type cublasLogCallback = ::core::option::Option< + unsafe extern "C" fn(msg: *const ::core::ffi::c_char), +>; +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct cublasLtContext { + _unused: [u8; 0], +} +/// Opaque structure holding CUBLASLT context +pub type cublasLtHandle_t = *mut cublasLtContext; +/// Semi-opaque descriptor for matrix memory layout +#[repr(C)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cublasLtMatrixLayoutOpaque_t { + pub data: [u64; 8usize], +} +/// Opaque descriptor for matrix memory layout +pub type cublasLtMatrixLayout_t = *mut cublasLtMatrixLayoutOpaque_t; +/** Semi-opaque algorithm descriptor (to avoid complicated alloc/free schemes) + + This structure can be trivially serialized and later restored for use with the same version of cuBLAS library to save + on selecting the right configuration again.*/ +#[repr(C)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cublasLtMatmulAlgo_t { + pub data: [u64; 8usize], +} +/// Semi-opaque descriptor for cublasLtMatmul() operation details +#[repr(C)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cublasLtMatmulDescOpaque_t { + pub data: [u64; 32usize], +} +/// Opaque descriptor for cublasLtMatmul() operation details +pub type cublasLtMatmulDesc_t = *mut cublasLtMatmulDescOpaque_t; +/// Semi-opaque descriptor for cublasLtMatrixTransform() operation details +#[repr(C)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cublasLtMatrixTransformDescOpaque_t { + pub data: [u64; 8usize], +} +/// Opaque descriptor for cublasLtMatrixTransform() operation details +pub type cublasLtMatrixTransformDesc_t = *mut cublasLtMatrixTransformDescOpaque_t; +/// Semi-opaque descriptor for cublasLtMatmulPreference() operation details +#[repr(C)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cublasLtMatmulPreferenceOpaque_t { + pub data: [u64; 8usize], +} +/// Opaque descriptor for cublasLtMatmulAlgoGetHeuristic() configuration +pub type cublasLtMatmulPreference_t = *mut cublasLtMatmulPreferenceOpaque_t; +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_UNDEFINED: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 0, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_8x8: cublasLtMatmulTile_t = cublasLtMatmulTile_t(1); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_8x16: cublasLtMatmulTile_t = cublasLtMatmulTile_t(2); +} +impl cublasLtMatmulTile_t { + pub const 
CUBLASLT_MATMUL_TILE_16x8: cublasLtMatmulTile_t = cublasLtMatmulTile_t(3); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_8x32: cublasLtMatmulTile_t = cublasLtMatmulTile_t(4); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_16x16: cublasLtMatmulTile_t = cublasLtMatmulTile_t(5); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_32x8: cublasLtMatmulTile_t = cublasLtMatmulTile_t(6); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_8x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t(7); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_16x32: cublasLtMatmulTile_t = cublasLtMatmulTile_t(8); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_32x16: cublasLtMatmulTile_t = cublasLtMatmulTile_t(9); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x8: cublasLtMatmulTile_t = cublasLtMatmulTile_t(10); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_32x32: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 11, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_32x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 12, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x32: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 13, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_32x128: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 14, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 15, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x32: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 16, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x128: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 17, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 18, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x256: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 19, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x128: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 20, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_256x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 21, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x512: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 22, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x256: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 23, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_256x128: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 24, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_512x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 25, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x96: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 26, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_96x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 27, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_96x128: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 28, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x160: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 29, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_160x128: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 30, + ); +} +impl cublasLtMatmulTile_t { + pub const 
CUBLASLT_MATMUL_TILE_192x128: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 31, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x192: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 32, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x96: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 33, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_32x256: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 34, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_256x32: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 35, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_8x128: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 36, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_8x192: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 37, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_8x256: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 38, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_8x320: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 39, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_8x384: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 40, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_8x448: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 41, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_8x512: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 42, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_8x576: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 43, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_8x640: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 44, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_8x704: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 45, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_8x768: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 46, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_16x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 47, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_16x128: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 48, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_16x192: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 49, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_16x256: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 50, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_16x320: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 51, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_16x384: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 52, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_16x448: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 53, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_16x512: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 54, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_16x576: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 55, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_16x640: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 56, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_16x704: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 57, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_16x768: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 58, + 
); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_24x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 59, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_24x128: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 60, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_24x192: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 61, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_24x256: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 62, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_24x320: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 63, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_24x384: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 64, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_24x448: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 65, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_24x512: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 66, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_24x576: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 67, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_24x640: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 68, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_24x704: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 69, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_24x768: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 70, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_32x192: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 71, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_32x320: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 72, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_32x384: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 73, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_32x448: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 74, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_32x512: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 75, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_32x576: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 76, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_32x640: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 77, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_32x704: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 78, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_32x768: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 79, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_40x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 80, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_40x128: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 81, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_40x192: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 82, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_40x256: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 83, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_40x320: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 84, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_40x384: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 85, + ); +} +impl cublasLtMatmulTile_t { + pub const 
CUBLASLT_MATMUL_TILE_40x448: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 86, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_40x512: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 87, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_40x576: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 88, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_40x640: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 89, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_40x704: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 90, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_40x768: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 91, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_48x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 92, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_48x128: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 93, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_48x192: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 94, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_48x256: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 95, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_48x320: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 96, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_48x384: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 97, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_48x448: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 98, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_48x512: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 99, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_48x576: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 100, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_48x640: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 101, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_48x704: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 102, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_48x768: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 103, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_56x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 104, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_56x128: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 105, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_56x192: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 106, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_56x256: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 107, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_56x320: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 108, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_56x384: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 109, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_56x448: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 110, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_56x512: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 111, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_56x576: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 112, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_56x640: cublasLtMatmulTile_t = 
cublasLtMatmulTile_t( + 113, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_56x704: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 114, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_56x768: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 115, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x192: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 116, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x320: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 117, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x384: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 118, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x448: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 119, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x576: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 120, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x640: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 121, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x704: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 122, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x768: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 123, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_72x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 124, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_72x128: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 125, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_72x192: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 126, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_72x256: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 127, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_72x320: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 128, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_72x384: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 129, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_72x448: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 130, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_72x512: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 131, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_72x576: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 132, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_72x640: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 133, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_80x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 134, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_80x128: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 135, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_80x192: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 136, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_80x256: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 137, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_80x320: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 138, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_80x384: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 139, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_80x448: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 140, + ); +} +impl 
cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_80x512: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 141, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_80x576: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 142, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_88x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 143, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_88x128: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 144, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_88x192: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 145, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_88x256: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 146, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_88x320: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 147, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_88x384: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 148, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_88x448: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 149, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_88x512: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 150, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_96x192: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 151, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_96x256: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 152, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_96x320: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 153, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_96x384: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 154, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_96x448: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 155, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_96x512: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 156, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_104x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 157, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_104x128: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 158, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_104x192: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 159, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_104x256: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 160, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_104x320: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 161, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_104x384: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 162, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_104x448: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 163, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_112x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 164, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_112x128: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 165, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_112x192: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 166, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_112x256: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 167, + ); +} +impl cublasLtMatmulTile_t { + pub const 
CUBLASLT_MATMUL_TILE_112x320: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 168, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_112x384: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 169, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_120x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 170, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_120x128: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 171, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_120x192: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 172, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_120x256: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 173, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_120x320: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 174, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_120x384: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 175, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x320: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 176, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x384: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 177, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_136x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 178, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_136x128: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 179, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_136x192: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 180, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_136x256: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 181, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_136x320: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 182, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_144x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 183, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_144x128: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 184, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_144x192: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 185, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_144x256: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 186, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_144x320: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 187, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_152x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 188, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_152x128: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 189, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_152x192: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 190, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_152x256: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 191, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_152x320: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 192, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_160x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 193, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_160x192: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 194, + ); +} +impl cublasLtMatmulTile_t { + pub const 
CUBLASLT_MATMUL_TILE_160x256: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 195, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_168x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 196, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_168x128: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 197, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_168x192: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 198, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_168x256: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 199, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_176x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 200, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_176x128: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 201, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_176x192: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 202, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_176x256: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 203, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_184x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 204, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_184x128: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 205, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_184x192: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 206, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_184x256: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 207, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_192x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 208, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_192x192: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 209, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_192x256: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 210, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_200x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 211, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_200x128: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 212, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_200x192: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 213, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_208x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 214, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_208x128: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 215, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_208x192: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 216, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_216x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 217, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_216x128: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 218, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_216x192: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 219, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_224x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 220, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_224x128: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 221, + ); +} +impl cublasLtMatmulTile_t { + pub const 
CUBLASLT_MATMUL_TILE_224x192: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 222, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_232x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 223, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_232x128: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 224, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_232x192: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 225, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_240x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 226, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_240x128: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 227, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_240x192: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 228, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_248x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 229, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_248x128: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 230, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_248x192: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 231, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_256x192: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 232, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_264x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 233, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_264x128: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 234, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_272x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 235, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_272x128: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 236, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_280x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 237, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_280x128: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 238, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_288x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 239, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_288x128: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 240, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_296x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 241, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_296x128: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 242, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_304x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 243, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_304x128: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 244, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_312x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 245, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_312x128: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 246, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_320x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 247, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_320x128: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 248, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_328x64: 
cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 249, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_328x128: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 250, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_336x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 251, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_336x128: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 252, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_344x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 253, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_344x128: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 254, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_352x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 255, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_352x128: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 256, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_360x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 257, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_360x128: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 258, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_368x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 259, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_368x128: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 260, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_376x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 261, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_376x128: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 262, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_384x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 263, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_384x128: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 264, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_392x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 265, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_400x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 266, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_408x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 267, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_416x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 268, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_424x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 269, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_432x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 270, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_440x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 271, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_448x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 272, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_456x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 273, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_464x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 274, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_472x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 275, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_480x64: cublasLtMatmulTile_t = 
cublasLtMatmulTile_t( + 276, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_488x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 277, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_496x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 278, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_504x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 279, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_520x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 280, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_528x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 281, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_536x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 282, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_544x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 283, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_552x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 284, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_560x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 285, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_568x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 286, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_576x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 287, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_584x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 288, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_592x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 289, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_600x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 290, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_608x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 291, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_616x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 292, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_624x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 293, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_632x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 294, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_640x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 295, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_648x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 296, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_656x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 297, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_664x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 298, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_672x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 299, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_680x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 300, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_688x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 301, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_696x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 302, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_704x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 303, + ); +} +impl 
cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_712x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 304, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_720x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 305, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_728x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 306, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_736x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 307, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_744x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 308, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_752x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 309, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_760x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 310, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_768x64: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 311, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x16: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 312, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x24: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 313, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x40: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 314, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x48: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 315, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x56: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 316, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x72: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 317, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x80: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 318, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x88: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 319, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x104: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 320, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x112: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 321, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x120: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 322, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x136: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 323, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x144: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 324, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x152: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 325, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x160: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 326, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x168: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 327, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x176: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 328, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x184: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 329, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x200: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 330, + ); +} +impl cublasLtMatmulTile_t { + pub const 
CUBLASLT_MATMUL_TILE_64x208: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 331, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x216: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 332, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x224: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 333, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x232: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 334, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x240: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 335, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x248: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 336, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x264: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 337, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x272: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 338, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x280: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 339, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x288: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 340, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x296: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 341, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x304: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 342, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x312: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 343, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x328: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 344, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x336: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 345, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x344: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 346, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x352: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 347, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x360: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 348, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x368: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 349, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x376: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 350, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x392: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 351, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x400: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 352, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x408: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 353, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x416: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 354, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x424: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 355, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x432: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 356, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x440: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 357, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x456: 
cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 358, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x464: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 359, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x472: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 360, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x480: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 361, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x488: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 362, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x496: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 363, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x504: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 364, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x520: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 365, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x528: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 366, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x536: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 367, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x544: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 368, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x552: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 369, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x560: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 370, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x568: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 371, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x584: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 372, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x592: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 373, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x600: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 374, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x608: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 375, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x616: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 376, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x624: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 377, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x632: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 378, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x648: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 379, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x656: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 380, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x664: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 381, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x672: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 382, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x680: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 383, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x688: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 384, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x696: cublasLtMatmulTile_t = cublasLtMatmulTile_t( 
+ 385, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x712: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 386, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x720: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 387, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x728: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 388, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x736: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 389, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x744: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 390, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x752: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 391, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_64x760: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 392, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x8: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 393, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x16: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 394, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x24: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 395, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x40: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 396, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x48: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 397, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x56: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 398, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x72: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 399, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x80: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 400, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x88: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 401, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x104: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 402, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x112: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 403, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x120: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 404, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x136: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 405, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x144: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 406, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x152: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 407, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x168: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 408, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x176: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 409, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x184: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 410, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x200: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 411, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x208: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 412, + ); +} +impl 
cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x216: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 413, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x224: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 414, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x232: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 415, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x240: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 416, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x248: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 417, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x264: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 418, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x272: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 419, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x280: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 420, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x288: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 421, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x296: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 422, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x304: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 423, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x312: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 424, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x328: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 425, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x336: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 426, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x344: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 427, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x352: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 428, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x360: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 429, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x368: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 430, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x376: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 431, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x392: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 432, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x400: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 433, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x408: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 434, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x416: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 435, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x424: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 436, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x432: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 437, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x440: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 438, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x448: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 439, + ); +} +impl 
cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x456: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 440, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x464: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 441, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x472: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 442, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x480: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 443, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x488: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 444, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x496: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 445, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x504: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 446, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_128x512: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 447, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_192x8: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 448, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_192x16: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 449, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_192x24: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 450, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_192x32: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 451, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_192x40: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 452, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_192x48: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 453, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_192x56: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 454, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_192x72: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 455, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_192x80: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 456, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_192x88: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 457, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_192x96: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 458, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_192x104: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 459, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_192x112: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 460, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_192x120: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 461, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_192x136: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 462, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_192x144: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 463, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_192x152: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 464, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_192x160: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 465, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_192x168: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 466, + ); +} +impl cublasLtMatmulTile_t { + pub 
const CUBLASLT_MATMUL_TILE_192x176: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 467, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_192x184: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 468, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_192x200: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 469, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_192x208: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 470, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_192x216: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 471, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_192x224: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 472, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_192x232: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 473, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_192x240: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 474, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_192x248: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 475, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_192x264: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 476, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_192x272: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 477, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_192x280: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 478, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_192x288: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 479, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_192x296: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 480, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_192x304: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 481, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_192x312: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 482, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_192x320: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 483, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_192x328: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 484, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_192x336: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 485, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_256x8: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 486, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_256x16: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 487, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_256x24: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 488, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_256x40: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 489, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_256x48: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 490, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_256x56: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 491, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_256x72: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 492, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_256x80: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 493, + ); +} +impl cublasLtMatmulTile_t { + pub const 
CUBLASLT_MATMUL_TILE_256x88: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 494, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_256x96: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 495, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_256x104: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 496, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_256x112: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 497, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_256x120: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 498, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_256x136: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 499, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_256x144: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 500, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_256x152: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 501, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_256x160: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 502, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_256x168: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 503, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_256x176: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 504, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_256x184: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 505, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_256x200: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 506, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_256x208: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 507, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_256x216: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 508, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_256x224: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 509, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_256x232: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 510, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_256x240: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 511, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_256x248: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 512, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_256x256: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 513, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_320x8: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 514, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_320x16: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 515, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_320x24: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 516, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_320x32: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 517, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_320x40: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 518, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_320x48: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 519, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_320x56: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 520, + ); +} +impl cublasLtMatmulTile_t { + pub const 
CUBLASLT_MATMUL_TILE_320x72: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 521, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_320x80: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 522, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_320x88: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 523, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_320x96: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 524, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_320x104: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 525, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_320x112: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 526, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_320x120: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 527, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_320x136: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 528, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_320x144: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 529, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_320x152: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 530, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_320x160: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 531, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_320x168: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 532, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_320x176: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 533, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_320x184: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 534, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_320x192: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 535, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_320x200: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 536, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_384x8: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 537, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_384x16: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 538, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_384x24: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 539, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_384x32: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 540, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_384x40: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 541, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_384x48: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 542, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_384x56: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 543, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_384x72: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 544, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_384x80: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 545, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_384x88: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 546, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_384x96: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 547, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_384x104: 
cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 548, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_384x112: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 549, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_384x120: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 550, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_384x136: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 551, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_384x144: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 552, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_384x152: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 553, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_384x160: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 554, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_384x168: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 555, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_448x8: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 556, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_448x16: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 557, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_448x24: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 558, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_448x32: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 559, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_448x40: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 560, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_448x48: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 561, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_448x56: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 562, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_448x72: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 563, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_448x80: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 564, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_448x88: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 565, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_448x96: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 566, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_448x104: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 567, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_448x112: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 568, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_448x120: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 569, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_448x128: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 570, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_448x136: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 571, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_448x144: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 572, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_512x8: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 573, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_512x16: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 574, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_512x24: cublasLtMatmulTile_t = 
cublasLtMatmulTile_t( + 575, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_512x32: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 576, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_512x40: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 577, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_512x48: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 578, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_512x56: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 579, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_512x72: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 580, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_512x80: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 581, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_512x88: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 582, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_512x96: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 583, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_512x104: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 584, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_512x112: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 585, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_512x120: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 586, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_512x128: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 587, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_576x8: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 588, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_576x16: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 589, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_576x24: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 590, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_576x32: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 591, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_576x40: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 592, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_576x48: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 593, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_576x56: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 594, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_576x72: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 595, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_576x80: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 596, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_576x88: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 597, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_576x96: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 598, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_576x104: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 599, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_576x112: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 600, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_640x8: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 601, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_640x16: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 602, + ); +} 
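The tile names encode the C/D output tile as Rows x Cols (see the type's doc comment after the last constant, below), which makes decoding a tile ID back into its dimensions a plain table lookup. A minimal sketch, not part of this patch, using two of the constants generated above:

// Sketch (not part of the patch): the constants are associated consts on a
// newtype that derives PartialEq/Eq, so they are usable as match patterns.
fn tile_dims(tile: cublasLtMatmulTile_t) -> Option<(u32, u32)> {
    match tile {
        cublasLtMatmulTile_t::CUBLASLT_MATMUL_TILE_512x32 => Some((512, 32)),
        cublasLtMatmulTile_t::CUBLASLT_MATMUL_TILE_576x8 => Some((576, 8)),
        // ...one arm per constant in a complete mapping.
        _ => None,
    }
}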
+impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_640x24: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 603, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_640x32: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 604, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_640x40: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 605, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_640x48: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 606, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_640x56: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 607, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_640x72: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 608, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_640x80: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 609, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_640x88: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 610, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_640x96: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 611, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_704x8: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 612, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_704x16: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 613, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_704x24: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 614, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_704x32: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 615, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_704x40: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 616, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_704x48: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 617, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_704x56: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 618, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_704x72: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 619, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_704x80: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 620, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_704x88: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 621, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_768x8: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 622, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_768x16: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 623, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_768x24: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 624, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_768x32: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 625, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_768x40: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 626, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_768x48: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 627, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_768x56: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 628, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_768x72: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 629, + ); +} +impl cublasLtMatmulTile_t { + pub const 
CUBLASLT_MATMUL_TILE_768x80: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 630, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_256x512: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 631, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_256x1024: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 632, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_512x512: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 633, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_512x1024: cublasLtMatmulTile_t = cublasLtMatmulTile_t( + 634, + ); +} +impl cublasLtMatmulTile_t { + pub const CUBLASLT_MATMUL_TILE_END: cublasLtMatmulTile_t = cublasLtMatmulTile_t(635); +} +#[repr(transparent)] +/** Tile size (in C/D matrix Rows x Cols) + + General order of tile IDs is sorted by size first and by first dimension second.*/ +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cublasLtMatmulTile_t(pub ::core::ffi::c_uint); +impl cublasLtMatmulStages_t { + pub const CUBLASLT_MATMUL_STAGES_UNDEFINED: cublasLtMatmulStages_t = cublasLtMatmulStages_t( + 0, + ); +} +impl cublasLtMatmulStages_t { + pub const CUBLASLT_MATMUL_STAGES_16x1: cublasLtMatmulStages_t = cublasLtMatmulStages_t( + 1, + ); +} +impl cublasLtMatmulStages_t { + pub const CUBLASLT_MATMUL_STAGES_16x2: cublasLtMatmulStages_t = cublasLtMatmulStages_t( + 2, + ); +} +impl cublasLtMatmulStages_t { + pub const CUBLASLT_MATMUL_STAGES_16x3: cublasLtMatmulStages_t = cublasLtMatmulStages_t( + 3, + ); +} +impl cublasLtMatmulStages_t { + pub const CUBLASLT_MATMUL_STAGES_16x4: cublasLtMatmulStages_t = cublasLtMatmulStages_t( + 4, + ); +} +impl cublasLtMatmulStages_t { + pub const CUBLASLT_MATMUL_STAGES_16x5: cublasLtMatmulStages_t = cublasLtMatmulStages_t( + 5, + ); +} +impl cublasLtMatmulStages_t { + pub const CUBLASLT_MATMUL_STAGES_16x6: cublasLtMatmulStages_t = cublasLtMatmulStages_t( + 6, + ); +} +impl cublasLtMatmulStages_t { + pub const CUBLASLT_MATMUL_STAGES_32x1: cublasLtMatmulStages_t = cublasLtMatmulStages_t( + 7, + ); +} +impl cublasLtMatmulStages_t { + pub const CUBLASLT_MATMUL_STAGES_32x2: cublasLtMatmulStages_t = cublasLtMatmulStages_t( + 8, + ); +} +impl cublasLtMatmulStages_t { + pub const CUBLASLT_MATMUL_STAGES_32x3: cublasLtMatmulStages_t = cublasLtMatmulStages_t( + 9, + ); +} +impl cublasLtMatmulStages_t { + pub const CUBLASLT_MATMUL_STAGES_32x4: cublasLtMatmulStages_t = cublasLtMatmulStages_t( + 10, + ); +} +impl cublasLtMatmulStages_t { + pub const CUBLASLT_MATMUL_STAGES_32x5: cublasLtMatmulStages_t = cublasLtMatmulStages_t( + 11, + ); +} +impl cublasLtMatmulStages_t { + pub const CUBLASLT_MATMUL_STAGES_32x6: cublasLtMatmulStages_t = cublasLtMatmulStages_t( + 12, + ); +} +impl cublasLtMatmulStages_t { + pub const CUBLASLT_MATMUL_STAGES_64x1: cublasLtMatmulStages_t = cublasLtMatmulStages_t( + 13, + ); +} +impl cublasLtMatmulStages_t { + pub const CUBLASLT_MATMUL_STAGES_64x2: cublasLtMatmulStages_t = cublasLtMatmulStages_t( + 14, + ); +} +impl cublasLtMatmulStages_t { + pub const CUBLASLT_MATMUL_STAGES_64x3: cublasLtMatmulStages_t = cublasLtMatmulStages_t( + 15, + ); +} +impl cublasLtMatmulStages_t { + pub const CUBLASLT_MATMUL_STAGES_64x4: cublasLtMatmulStages_t = cublasLtMatmulStages_t( + 16, + ); +} +impl cublasLtMatmulStages_t { + pub const CUBLASLT_MATMUL_STAGES_64x5: cublasLtMatmulStages_t = cublasLtMatmulStages_t( + 17, + ); +} +impl cublasLtMatmulStages_t { + pub const CUBLASLT_MATMUL_STAGES_64x6: cublasLtMatmulStages_t = cublasLtMatmulStages_t( + 18, + ); 
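Note the pattern used for every enum in these bindings: instead of a Rust enum, bindgen emits a #[repr(transparent)] newtype over c_uint with the variants as associated consts, so discriminants the driver returns that Rust does not know about stay representable. A minimal round-trip sketch, not part of this patch:

fn main() {
    // Sketch (not part of the patch): the transparent newtype wraps a plain
    // c_uint, so values round-trip losslessly across the C ABI.
    let tile = cublasLtMatmulTile_t::CUBLASLT_MATMUL_TILE_256x256;
    assert_eq!(tile.0, 513); // the raw discriminant generated above
    assert_eq!(cublasLtMatmulTile_t(513), tile); // reconstructable from the raw value
}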
+} +impl cublasLtMatmulStages_t { + pub const CUBLASLT_MATMUL_STAGES_128x1: cublasLtMatmulStages_t = cublasLtMatmulStages_t( + 19, + ); +} +impl cublasLtMatmulStages_t { + pub const CUBLASLT_MATMUL_STAGES_128x2: cublasLtMatmulStages_t = cublasLtMatmulStages_t( + 20, + ); +} +impl cublasLtMatmulStages_t { + pub const CUBLASLT_MATMUL_STAGES_128x3: cublasLtMatmulStages_t = cublasLtMatmulStages_t( + 21, + ); +} +impl cublasLtMatmulStages_t { + pub const CUBLASLT_MATMUL_STAGES_128x4: cublasLtMatmulStages_t = cublasLtMatmulStages_t( + 22, + ); +} +impl cublasLtMatmulStages_t { + pub const CUBLASLT_MATMUL_STAGES_128x5: cublasLtMatmulStages_t = cublasLtMatmulStages_t( + 23, + ); +} +impl cublasLtMatmulStages_t { + pub const CUBLASLT_MATMUL_STAGES_128x6: cublasLtMatmulStages_t = cublasLtMatmulStages_t( + 24, + ); +} +impl cublasLtMatmulStages_t { + pub const CUBLASLT_MATMUL_STAGES_32x10: cublasLtMatmulStages_t = cublasLtMatmulStages_t( + 25, + ); +} +impl cublasLtMatmulStages_t { + pub const CUBLASLT_MATMUL_STAGES_8x4: cublasLtMatmulStages_t = cublasLtMatmulStages_t( + 26, + ); +} +impl cublasLtMatmulStages_t { + pub const CUBLASLT_MATMUL_STAGES_16x10: cublasLtMatmulStages_t = cublasLtMatmulStages_t( + 27, + ); +} +impl cublasLtMatmulStages_t { + pub const CUBLASLT_MATMUL_STAGES_8x5: cublasLtMatmulStages_t = cublasLtMatmulStages_t( + 28, + ); +} +impl cublasLtMatmulStages_t { + pub const CUBLASLT_MATMUL_STAGES_8x3: cublasLtMatmulStages_t = cublasLtMatmulStages_t( + 31, + ); +} +impl cublasLtMatmulStages_t { + pub const CUBLASLT_MATMUL_STAGES_8xAUTO: cublasLtMatmulStages_t = cublasLtMatmulStages_t( + 32, + ); +} +impl cublasLtMatmulStages_t { + pub const CUBLASLT_MATMUL_STAGES_16xAUTO: cublasLtMatmulStages_t = cublasLtMatmulStages_t( + 33, + ); +} +impl cublasLtMatmulStages_t { + pub const CUBLASLT_MATMUL_STAGES_32xAUTO: cublasLtMatmulStages_t = cublasLtMatmulStages_t( + 34, + ); +} +impl cublasLtMatmulStages_t { + pub const CUBLASLT_MATMUL_STAGES_64xAUTO: cublasLtMatmulStages_t = cublasLtMatmulStages_t( + 35, + ); +} +impl cublasLtMatmulStages_t { + pub const CUBLASLT_MATMUL_STAGES_128xAUTO: cublasLtMatmulStages_t = cublasLtMatmulStages_t( + 36, + ); +} +impl cublasLtMatmulStages_t { + pub const CUBLASLT_MATMUL_STAGES_256xAUTO: cublasLtMatmulStages_t = cublasLtMatmulStages_t( + 37, + ); +} +impl cublasLtMatmulStages_t { + pub const CUBLASLT_MATMUL_STAGES_END: cublasLtMatmulStages_t = cublasLtMatmulStages_t( + 38, + ); +} +#[repr(transparent)] +/** Size and number of stages in which elements are read into shared memory + + General order of stages IDs is sorted by stage size first and by number of stages second.*/ +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cublasLtMatmulStages_t(pub ::core::ffi::c_uint); +impl cublasLtClusterShape_t { + /// Let library pick cluster shape automatically + pub const CUBLASLT_CLUSTER_SHAPE_AUTO: cublasLtClusterShape_t = cublasLtClusterShape_t( + 0, + ); +} +impl cublasLtClusterShape_t { + /// Let library pick cluster shape automatically + pub const CUBLASLT_CLUSTER_SHAPE_1x1x1: cublasLtClusterShape_t = cublasLtClusterShape_t( + 2, + ); +} +impl cublasLtClusterShape_t { + /// Let library pick cluster shape automatically + pub const CUBLASLT_CLUSTER_SHAPE_2x1x1: cublasLtClusterShape_t = cublasLtClusterShape_t( + 3, + ); +} +impl cublasLtClusterShape_t { + /// Let library pick cluster shape automatically + pub const CUBLASLT_CLUSTER_SHAPE_4x1x1: cublasLtClusterShape_t = cublasLtClusterShape_t( + 4, + ); +} +impl cublasLtClusterShape_t { + /// Let 
library pick cluster shape automatically + pub const CUBLASLT_CLUSTER_SHAPE_1x2x1: cublasLtClusterShape_t = cublasLtClusterShape_t( + 5, + ); +} +impl cublasLtClusterShape_t { + /// Let library pick cluster shape automatically + pub const CUBLASLT_CLUSTER_SHAPE_2x2x1: cublasLtClusterShape_t = cublasLtClusterShape_t( + 6, + ); +} +impl cublasLtClusterShape_t { + /// Let library pick cluster shape automatically + pub const CUBLASLT_CLUSTER_SHAPE_4x2x1: cublasLtClusterShape_t = cublasLtClusterShape_t( + 7, + ); +} +impl cublasLtClusterShape_t { + /// Let library pick cluster shape automatically + pub const CUBLASLT_CLUSTER_SHAPE_1x4x1: cublasLtClusterShape_t = cublasLtClusterShape_t( + 8, + ); +} +impl cublasLtClusterShape_t { + /// Let library pick cluster shape automatically + pub const CUBLASLT_CLUSTER_SHAPE_2x4x1: cublasLtClusterShape_t = cublasLtClusterShape_t( + 9, + ); +} +impl cublasLtClusterShape_t { + /// Let library pick cluster shape automatically + pub const CUBLASLT_CLUSTER_SHAPE_4x4x1: cublasLtClusterShape_t = cublasLtClusterShape_t( + 10, + ); +} +impl cublasLtClusterShape_t { + /// Let library pick cluster shape automatically + pub const CUBLASLT_CLUSTER_SHAPE_8x1x1: cublasLtClusterShape_t = cublasLtClusterShape_t( + 11, + ); +} +impl cublasLtClusterShape_t { + /// Let library pick cluster shape automatically + pub const CUBLASLT_CLUSTER_SHAPE_1x8x1: cublasLtClusterShape_t = cublasLtClusterShape_t( + 12, + ); +} +impl cublasLtClusterShape_t { + /// Let library pick cluster shape automatically + pub const CUBLASLT_CLUSTER_SHAPE_8x2x1: cublasLtClusterShape_t = cublasLtClusterShape_t( + 13, + ); +} +impl cublasLtClusterShape_t { + /// Let library pick cluster shape automatically + pub const CUBLASLT_CLUSTER_SHAPE_2x8x1: cublasLtClusterShape_t = cublasLtClusterShape_t( + 14, + ); +} +impl cublasLtClusterShape_t { + /// Let library pick cluster shape automatically + pub const CUBLASLT_CLUSTER_SHAPE_16x1x1: cublasLtClusterShape_t = cublasLtClusterShape_t( + 15, + ); +} +impl cublasLtClusterShape_t { + /// Let library pick cluster shape automatically + pub const CUBLASLT_CLUSTER_SHAPE_1x16x1: cublasLtClusterShape_t = cublasLtClusterShape_t( + 16, + ); +} +impl cublasLtClusterShape_t { + /// Let library pick cluster shape automatically + pub const CUBLASLT_CLUSTER_SHAPE_3x1x1: cublasLtClusterShape_t = cublasLtClusterShape_t( + 17, + ); +} +impl cublasLtClusterShape_t { + /// Let library pick cluster shape automatically + pub const CUBLASLT_CLUSTER_SHAPE_5x1x1: cublasLtClusterShape_t = cublasLtClusterShape_t( + 18, + ); +} +impl cublasLtClusterShape_t { + /// Let library pick cluster shape automatically + pub const CUBLASLT_CLUSTER_SHAPE_6x1x1: cublasLtClusterShape_t = cublasLtClusterShape_t( + 19, + ); +} +impl cublasLtClusterShape_t { + /// Let library pick cluster shape automatically + pub const CUBLASLT_CLUSTER_SHAPE_7x1x1: cublasLtClusterShape_t = cublasLtClusterShape_t( + 20, + ); +} +impl cublasLtClusterShape_t { + /// Let library pick cluster shape automatically + pub const CUBLASLT_CLUSTER_SHAPE_9x1x1: cublasLtClusterShape_t = cublasLtClusterShape_t( + 21, + ); +} +impl cublasLtClusterShape_t { + /// Let library pick cluster shape automatically + pub const CUBLASLT_CLUSTER_SHAPE_10x1x1: cublasLtClusterShape_t = cublasLtClusterShape_t( + 22, + ); +} +impl cublasLtClusterShape_t { + /// Let library pick cluster shape automatically + pub const CUBLASLT_CLUSTER_SHAPE_11x1x1: cublasLtClusterShape_t = cublasLtClusterShape_t( + 23, + ); +} +impl cublasLtClusterShape_t { + /// Let 
library pick cluster shape automatically + pub const CUBLASLT_CLUSTER_SHAPE_12x1x1: cublasLtClusterShape_t = cublasLtClusterShape_t( + 24, + ); +} +impl cublasLtClusterShape_t { + /// Let library pick cluster shape automatically + pub const CUBLASLT_CLUSTER_SHAPE_13x1x1: cublasLtClusterShape_t = cublasLtClusterShape_t( + 25, + ); +} +impl cublasLtClusterShape_t { + /// Let library pick cluster shape automatically + pub const CUBLASLT_CLUSTER_SHAPE_14x1x1: cublasLtClusterShape_t = cublasLtClusterShape_t( + 26, + ); +} +impl cublasLtClusterShape_t { + /// Let library pick cluster shape automatically + pub const CUBLASLT_CLUSTER_SHAPE_15x1x1: cublasLtClusterShape_t = cublasLtClusterShape_t( + 27, + ); +} +impl cublasLtClusterShape_t { + /// Let library pick cluster shape automatically + pub const CUBLASLT_CLUSTER_SHAPE_3x2x1: cublasLtClusterShape_t = cublasLtClusterShape_t( + 28, + ); +} +impl cublasLtClusterShape_t { + /// Let library pick cluster shape automatically + pub const CUBLASLT_CLUSTER_SHAPE_5x2x1: cublasLtClusterShape_t = cublasLtClusterShape_t( + 29, + ); +} +impl cublasLtClusterShape_t { + /// Let library pick cluster shape automatically + pub const CUBLASLT_CLUSTER_SHAPE_6x2x1: cublasLtClusterShape_t = cublasLtClusterShape_t( + 30, + ); +} +impl cublasLtClusterShape_t { + /// Let library pick cluster shape automatically + pub const CUBLASLT_CLUSTER_SHAPE_7x2x1: cublasLtClusterShape_t = cublasLtClusterShape_t( + 31, + ); +} +impl cublasLtClusterShape_t { + /// Let library pick cluster shape automatically + pub const CUBLASLT_CLUSTER_SHAPE_1x3x1: cublasLtClusterShape_t = cublasLtClusterShape_t( + 32, + ); +} +impl cublasLtClusterShape_t { + /// Let library pick cluster shape automatically + pub const CUBLASLT_CLUSTER_SHAPE_2x3x1: cublasLtClusterShape_t = cublasLtClusterShape_t( + 33, + ); +} +impl cublasLtClusterShape_t { + /// Let library pick cluster shape automatically + pub const CUBLASLT_CLUSTER_SHAPE_3x3x1: cublasLtClusterShape_t = cublasLtClusterShape_t( + 34, + ); +} +impl cublasLtClusterShape_t { + /// Let library pick cluster shape automatically + pub const CUBLASLT_CLUSTER_SHAPE_4x3x1: cublasLtClusterShape_t = cublasLtClusterShape_t( + 35, + ); +} +impl cublasLtClusterShape_t { + /// Let library pick cluster shape automatically + pub const CUBLASLT_CLUSTER_SHAPE_5x3x1: cublasLtClusterShape_t = cublasLtClusterShape_t( + 36, + ); +} +impl cublasLtClusterShape_t { + /// Let library pick cluster shape automatically + pub const CUBLASLT_CLUSTER_SHAPE_3x4x1: cublasLtClusterShape_t = cublasLtClusterShape_t( + 37, + ); +} +impl cublasLtClusterShape_t { + /// Let library pick cluster shape automatically + pub const CUBLASLT_CLUSTER_SHAPE_1x5x1: cublasLtClusterShape_t = cublasLtClusterShape_t( + 38, + ); +} +impl cublasLtClusterShape_t { + /// Let library pick cluster shape automatically + pub const CUBLASLT_CLUSTER_SHAPE_2x5x1: cublasLtClusterShape_t = cublasLtClusterShape_t( + 39, + ); +} +impl cublasLtClusterShape_t { + /// Let library pick cluster shape automatically + pub const CUBLASLT_CLUSTER_SHAPE_3x5x1: cublasLtClusterShape_t = cublasLtClusterShape_t( + 40, + ); +} +impl cublasLtClusterShape_t { + /// Let library pick cluster shape automatically + pub const CUBLASLT_CLUSTER_SHAPE_1x6x1: cublasLtClusterShape_t = cublasLtClusterShape_t( + 41, + ); +} +impl cublasLtClusterShape_t { + /// Let library pick cluster shape automatically + pub const CUBLASLT_CLUSTER_SHAPE_2x6x1: cublasLtClusterShape_t = cublasLtClusterShape_t( + 42, + ); +} +impl cublasLtClusterShape_t { + /// 
Let library pick cluster shape automatically + pub const CUBLASLT_CLUSTER_SHAPE_1x7x1: cublasLtClusterShape_t = cublasLtClusterShape_t( + 43, + ); +} +impl cublasLtClusterShape_t { + /// Let library pick cluster shape automatically + pub const CUBLASLT_CLUSTER_SHAPE_2x7x1: cublasLtClusterShape_t = cublasLtClusterShape_t( + 44, + ); +} +impl cublasLtClusterShape_t { + /// Let library pick cluster shape automatically + pub const CUBLASLT_CLUSTER_SHAPE_1x9x1: cublasLtClusterShape_t = cublasLtClusterShape_t( + 45, + ); +} +impl cublasLtClusterShape_t { + /// Let library pick cluster shape automatically + pub const CUBLASLT_CLUSTER_SHAPE_1x10x1: cublasLtClusterShape_t = cublasLtClusterShape_t( + 46, + ); +} +impl cublasLtClusterShape_t { + /// Let library pick cluster shape automatically + pub const CUBLASLT_CLUSTER_SHAPE_1x11x1: cublasLtClusterShape_t = cublasLtClusterShape_t( + 47, + ); +} +impl cublasLtClusterShape_t { + /// Let library pick cluster shape automatically + pub const CUBLASLT_CLUSTER_SHAPE_1x12x1: cublasLtClusterShape_t = cublasLtClusterShape_t( + 48, + ); +} +impl cublasLtClusterShape_t { + /// Let library pick cluster shape automatically + pub const CUBLASLT_CLUSTER_SHAPE_1x13x1: cublasLtClusterShape_t = cublasLtClusterShape_t( + 49, + ); +} +impl cublasLtClusterShape_t { + /// Let library pick cluster shape automatically + pub const CUBLASLT_CLUSTER_SHAPE_1x14x1: cublasLtClusterShape_t = cublasLtClusterShape_t( + 50, + ); +} +impl cublasLtClusterShape_t { + /// Let library pick cluster shape automatically + pub const CUBLASLT_CLUSTER_SHAPE_1x15x1: cublasLtClusterShape_t = cublasLtClusterShape_t( + 51, + ); +} +impl cublasLtClusterShape_t { + /// Let library pick cluster shape automatically + pub const CUBLASLT_CLUSTER_SHAPE_END: cublasLtClusterShape_t = cublasLtClusterShape_t( + 52, + ); +} +#[repr(transparent)] +/** Thread Block Cluster size + + Typically dimensioned similar to cublasLtMatmulTile_t, with the third coordinate unused at this time.*/ +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cublasLtClusterShape_t(pub ::core::ffi::c_uint); +impl cublasLtMatmulInnerShape_t { + pub const CUBLASLT_MATMUL_INNER_SHAPE_UNDEFINED: cublasLtMatmulInnerShape_t = cublasLtMatmulInnerShape_t( + 0, + ); +} +impl cublasLtMatmulInnerShape_t { + pub const CUBLASLT_MATMUL_INNER_SHAPE_MMA884: cublasLtMatmulInnerShape_t = cublasLtMatmulInnerShape_t( + 1, + ); +} +impl cublasLtMatmulInnerShape_t { + pub const CUBLASLT_MATMUL_INNER_SHAPE_MMA1684: cublasLtMatmulInnerShape_t = cublasLtMatmulInnerShape_t( + 2, + ); +} +impl cublasLtMatmulInnerShape_t { + pub const CUBLASLT_MATMUL_INNER_SHAPE_MMA1688: cublasLtMatmulInnerShape_t = cublasLtMatmulInnerShape_t( + 3, + ); +} +impl cublasLtMatmulInnerShape_t { + pub const CUBLASLT_MATMUL_INNER_SHAPE_MMA16816: cublasLtMatmulInnerShape_t = cublasLtMatmulInnerShape_t( + 4, + ); +} +impl cublasLtMatmulInnerShape_t { + pub const CUBLASLT_MATMUL_INNER_SHAPE_END: cublasLtMatmulInnerShape_t = cublasLtMatmulInnerShape_t( + 5, + ); +} +#[repr(transparent)] +/** Inner size of the kernel + + Represents various aspects of internal kernel design, that don't impact CUDA grid size but may have other more subtle + effects. 
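One consequence of the cluster-shape values above: the discriminants are not contiguous. CUBLASLT_CLUSTER_SHAPE_AUTO is 0 but CUBLASLT_CLUSTER_SHAPE_1x1x1 is 2, so raw value 1 is unassigned and code converting integers back into shapes has to validate rather than assume density. A minimal sketch, not part of this patch:

// Sketch (not part of the patch): END (52) is one past the last valid value,
// but 1 is a hole, so an explicit range check is the safe conversion.
fn checked_cluster_shape(raw: u32) -> Option<cublasLtClusterShape_t> {
    match raw {
        0 | 2..=51 => Some(cublasLtClusterShape_t(raw)),
        _ => None,
    }
}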
+*/ +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cublasLtMatmulInnerShape_t(pub ::core::ffi::c_uint); +impl cublasLtMatmulMatrixScale_t { + /// Scaling factors are single precision scalars applied to the whole tensor + pub const CUBLASLT_MATMUL_MATRIX_SCALE_SCALAR_32F: cublasLtMatmulMatrixScale_t = cublasLtMatmulMatrixScale_t( + 0, + ); +} +impl cublasLtMatmulMatrixScale_t { + /** Scaling factors are tensors that contain a dedicated scaling factor stored as an 8-bit CUDA_R_8F_UE4M3 value for +each 16-element block in the innermost dimension of the corresponding data tensor*/ + pub const CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3: cublasLtMatmulMatrixScale_t = cublasLtMatmulMatrixScale_t( + 1, + ); +} +impl cublasLtMatmulMatrixScale_t { + /** Same as above, except that scaling factor tensor elements have type CUDA_R_8F_UE8M0 and the block size is 32 +elements*/ + pub const CUBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0: cublasLtMatmulMatrixScale_t = cublasLtMatmulMatrixScale_t( + 2, + ); +} +impl cublasLtMatmulMatrixScale_t { + /** Same as above, except that scaling factor tensor elements have type CUDA_R_8F_UE8M0 and the block size is 32 +elements*/ + pub const CUBLASLT_MATMUL_MATRIX_SCALE_END: cublasLtMatmulMatrixScale_t = cublasLtMatmulMatrixScale_t( + 3, + ); +} +#[repr(transparent)] +/// Scaling mode for per-matrix scaling +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cublasLtMatmulMatrixScale_t(pub ::core::ffi::c_uint); +impl cublasLtPointerMode_t { + /// matches CUBLAS_POINTER_MODE_HOST, pointer targets a single value host memory + pub const CUBLASLT_POINTER_MODE_HOST: cublasLtPointerMode_t = cublasLtPointerMode_t( + 0, + ); +} +impl cublasLtPointerMode_t { + /// matches CUBLAS_POINTER_MODE_DEVICE, pointer targets a single value device memory + pub const CUBLASLT_POINTER_MODE_DEVICE: cublasLtPointerMode_t = cublasLtPointerMode_t( + 1, + ); +} +impl cublasLtPointerMode_t { + /// pointer targets an array in device memory + pub const CUBLASLT_POINTER_MODE_DEVICE_VECTOR: cublasLtPointerMode_t = cublasLtPointerMode_t( + 2, + ); +} +impl cublasLtPointerMode_t { + /** alpha pointer targets an array in device memory, beta is zero. Note: +CUBLASLT_MATMUL_DESC_ALPHA_VECTOR_BATCH_STRIDE is not supported, must be 0.*/ + pub const CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_ZERO: cublasLtPointerMode_t = cublasLtPointerMode_t( + 3, + ); +} +impl cublasLtPointerMode_t { + /// alpha pointer targets an array in device memory, beta is a single value in host memory. 
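The cublasLtPointerModeMask_t constants that follow (1, 2, 4, 8, 16) are each exactly 1 << the matching cublasLtPointerMode_t value (0 through 4), so checking whether a capability mask admits a given pointer mode reduces to a shift-and-AND. A minimal sketch, not part of this patch:

// Sketch (not part of the patch): each mask bit is 1 << the pointer-mode value.
fn mode_in_mask(mask: u32, mode: cublasLtPointerMode_t) -> bool {
    mask & (1u32 << mode.0) != 0
}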
+    pub const CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_HOST: cublasLtPointerMode_t = cublasLtPointerMode_t(
+        4,
+    );
+}
+#[repr(transparent)]
+/// Pointer mode to use for alpha/beta
+#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
+pub struct cublasLtPointerMode_t(pub ::core::ffi::c_uint);
+impl cublasLtPointerModeMask_t {
+    /// see CUBLASLT_POINTER_MODE_HOST
+    pub const CUBLASLT_POINTER_MODE_MASK_HOST: cublasLtPointerModeMask_t = cublasLtPointerModeMask_t(
+        1,
+    );
+}
+impl cublasLtPointerModeMask_t {
+    /// see CUBLASLT_POINTER_MODE_DEVICE
+    pub const CUBLASLT_POINTER_MODE_MASK_DEVICE: cublasLtPointerModeMask_t = cublasLtPointerModeMask_t(
+        2,
+    );
+}
+impl cublasLtPointerModeMask_t {
+    /// see CUBLASLT_POINTER_MODE_DEVICE_VECTOR
+    pub const CUBLASLT_POINTER_MODE_MASK_DEVICE_VECTOR: cublasLtPointerModeMask_t = cublasLtPointerModeMask_t(
+        4,
+    );
+}
+impl cublasLtPointerModeMask_t {
+    /// see CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_ZERO
+    pub const CUBLASLT_POINTER_MODE_MASK_ALPHA_DEVICE_VECTOR_BETA_ZERO: cublasLtPointerModeMask_t = cublasLtPointerModeMask_t(
+        8,
+    );
+}
+impl cublasLtPointerModeMask_t {
+    /// see CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_HOST
+    pub const CUBLASLT_POINTER_MODE_MASK_ALPHA_DEVICE_VECTOR_BETA_HOST: cublasLtPointerModeMask_t = cublasLtPointerModeMask_t(
+        16,
+    );
+}
+#[repr(transparent)]
+/// Mask to define pointer mode capability
+#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
+pub struct cublasLtPointerModeMask_t(pub ::core::ffi::c_uint);
+pub type cublasLtNumericalImplFlags_t = u64;
+impl cublasLtOrder_t {
+    /** Column-major
+
+ Leading dimension is the stride (in elements) to the beginning of next column in memory.*/
+    pub const CUBLASLT_ORDER_COL: cublasLtOrder_t = cublasLtOrder_t(0);
+}
+impl cublasLtOrder_t {
+    /** Row-major
+
+ Leading dimension is the stride (in elements) to the beginning of next row in memory.*/
+    pub const CUBLASLT_ORDER_ROW: cublasLtOrder_t = cublasLtOrder_t(1);
+}
+impl cublasLtOrder_t {
+    /** Column-major ordered tiles of 32 columns.
+
+ Leading dimension is the stride (in elements) to the beginning of next group of 32-columns. E.g. if matrix has 33
+ columns and 2 rows, ld must be at least (32) * 2 = 64.*/
+    pub const CUBLASLT_ORDER_COL32: cublasLtOrder_t = cublasLtOrder_t(2);
+}
+impl cublasLtOrder_t {
+    /** Column-major ordered tiles of composite tiles with total 32 columns and 8 rows, tile composed of interleaved
+ inner tiles of 4 columns within 4 even or odd rows in an alternating pattern.
+
+ Leading dimension is the stride (in elements) to the beginning of the first 32 column x 8 row tile for the next
+ 32-wide group of columns. E.g. if matrix has 33 columns and 1 row, ld must be at least (32 * 8) * 1 = 256.*/
+    pub const CUBLASLT_ORDER_COL4_4R2_8C: cublasLtOrder_t = cublasLtOrder_t(3);
+}
+impl cublasLtOrder_t {
+    /** Column-major ordered tiles of composite tiles with total 32 columns and 32 rows.
+ Element offset within the tile is calculated as (((row%8)/2*4+row/8)*2+row%2)*32+col.
+
+ Leading dimension is the stride (in elements) to the beginning of the first 32 column x 32 row tile for the next
+ 32-wide group of columns. E.g.
if matrix has 33 columns and 1 row, ld must be at least (32*32)*1 = 1024.*/ + pub const CUBLASLT_ORDER_COL32_2R_4R4: cublasLtOrder_t = cublasLtOrder_t(4); +} +#[repr(transparent)] +/// Enum for data ordering +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cublasLtOrder_t(pub ::core::ffi::c_uint); +impl cublasLtMatrixLayoutAttribute_t { + /** Data type, see cudaDataType. + + uint32_t*/ + pub const CUBLASLT_MATRIX_LAYOUT_TYPE: cublasLtMatrixLayoutAttribute_t = cublasLtMatrixLayoutAttribute_t( + 0, + ); +} +impl cublasLtMatrixLayoutAttribute_t { + /** Memory order of the data, see cublasLtOrder_t. + + int32_t, default: CUBLASLT_ORDER_COL*/ + pub const CUBLASLT_MATRIX_LAYOUT_ORDER: cublasLtMatrixLayoutAttribute_t = cublasLtMatrixLayoutAttribute_t( + 1, + ); +} +impl cublasLtMatrixLayoutAttribute_t { + /** Number of rows. + + Usually only values that can be expressed as int32_t are supported. + + uint64_t*/ + pub const CUBLASLT_MATRIX_LAYOUT_ROWS: cublasLtMatrixLayoutAttribute_t = cublasLtMatrixLayoutAttribute_t( + 2, + ); +} +impl cublasLtMatrixLayoutAttribute_t { + /** Number of columns. + + Usually only values that can be expressed as int32_t are supported. + + uint64_t*/ + pub const CUBLASLT_MATRIX_LAYOUT_COLS: cublasLtMatrixLayoutAttribute_t = cublasLtMatrixLayoutAttribute_t( + 3, + ); +} +impl cublasLtMatrixLayoutAttribute_t { + /** Matrix leading dimension. + + For CUBLASLT_ORDER_COL this is stride (in elements) of matrix column, for more details and documentation for + other memory orders see documentation for cublasLtOrder_t values. + + Currently only non-negative values are supported, must be large enough so that matrix memory locations are not + overlapping (e.g. greater or equal to CUBLASLT_MATRIX_LAYOUT_ROWS in case of CUBLASLT_ORDER_COL). + + int64_t;*/ + pub const CUBLASLT_MATRIX_LAYOUT_LD: cublasLtMatrixLayoutAttribute_t = cublasLtMatrixLayoutAttribute_t( + 4, + ); +} +impl cublasLtMatrixLayoutAttribute_t { + /** Number of matmul operations to perform in the batch. + + See also CUBLASLT_ALGO_CAP_STRIDED_BATCH_SUPPORT + + int32_t, default: 1*/ + pub const CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT: cublasLtMatrixLayoutAttribute_t = cublasLtMatrixLayoutAttribute_t( + 5, + ); +} +impl cublasLtMatrixLayoutAttribute_t { + /** Stride (in elements) to the next matrix for strided batch operation. + + When matrix type is planar-complex (CUBLASLT_MATRIX_LAYOUT_PLANE_OFFSET != 0), batch stride + is interpreted by cublasLtMatmul() in number of real valued sub-elements. E.g. for data of type CUDA_C_16F, + offset of 1024B is encoded as a stride of value 512 (since each element of the real and imaginary matrices + is a 2B (16bit) floating point type). + + NOTE: A bug in cublasLtMatrixTransform() causes it to interpret the batch stride for a planar-complex matrix + as if it was specified in number of complex elements. Therefore an offset of 1024B must be encoded as stride + value 256 when calling cublasLtMatrixTransform() (each complex element is 4B with real and imaginary values 2B + each). This behavior is expected to be corrected in the next major cuBLAS version. + + int64_t, default: 0*/ + pub const CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET: cublasLtMatrixLayoutAttribute_t = cublasLtMatrixLayoutAttribute_t( + 6, + ); +} +impl cublasLtMatrixLayoutAttribute_t { + /** Stride (in bytes) to the imaginary plane for planar complex layout. 
+
+ int64_t, default: 0 - 0 means that layout is regular (real and imaginary parts of complex numbers are interleaved
+ in memory in each element)*/
+    pub const CUBLASLT_MATRIX_LAYOUT_PLANE_OFFSET: cublasLtMatrixLayoutAttribute_t = cublasLtMatrixLayoutAttribute_t(
+        7,
+    );
+}
+#[repr(transparent)]
+/// Attributes of memory layout
+#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
+pub struct cublasLtMatrixLayoutAttribute_t(pub ::core::ffi::c_uint);
+impl cublasLtMatmulDescAttributes_t {
+    /** Compute type, see cudaDataType. Defines data type used for multiply and accumulate operations and the
+ accumulator during matrix multiplication.
+
+ int32_t*/
+    pub const CUBLASLT_MATMUL_DESC_COMPUTE_TYPE: cublasLtMatmulDescAttributes_t = cublasLtMatmulDescAttributes_t(
+        0,
+    );
+}
+impl cublasLtMatmulDescAttributes_t {
+    /** Scale type, see cudaDataType. Defines data type of alpha and beta. Accumulator and value from matrix C are
+ typically converted to scale type before final scaling. Value is then converted from scale type to type of matrix
+ D before being stored in memory.
+
+ int32_t, default: same as CUBLASLT_MATMUL_DESC_COMPUTE_TYPE*/
+    pub const CUBLASLT_MATMUL_DESC_SCALE_TYPE: cublasLtMatmulDescAttributes_t = cublasLtMatmulDescAttributes_t(
+        1,
+    );
+}
+impl cublasLtMatmulDescAttributes_t {
+    /** Pointer mode of alpha and beta, see cublasLtPointerMode_t. When CUBLASLT_POINTER_MODE_DEVICE_VECTOR is in use,
+ alpha/beta vector lengths must match the number of output matrix rows.
+
+ int32_t, default: CUBLASLT_POINTER_MODE_HOST*/
+    pub const CUBLASLT_MATMUL_DESC_POINTER_MODE: cublasLtMatmulDescAttributes_t = cublasLtMatmulDescAttributes_t(
+        2,
+    );
+}
+impl cublasLtMatmulDescAttributes_t {
+    /** Transform of matrix A, see cublasOperation_t.
+
+ int32_t, default: CUBLAS_OP_N*/
+    pub const CUBLASLT_MATMUL_DESC_TRANSA: cublasLtMatmulDescAttributes_t = cublasLtMatmulDescAttributes_t(
+        3,
+    );
+}
+impl cublasLtMatmulDescAttributes_t {
+    /** Transform of matrix B, see cublasOperation_t.
+
+ int32_t, default: CUBLAS_OP_N*/
+    pub const CUBLASLT_MATMUL_DESC_TRANSB: cublasLtMatmulDescAttributes_t = cublasLtMatmulDescAttributes_t(
+        4,
+    );
+}
+impl cublasLtMatmulDescAttributes_t {
+    /** Transform of matrix C, see cublasOperation_t.
+
+ Currently only CUBLAS_OP_N is supported.
+
+ int32_t, default: CUBLAS_OP_N*/
+    pub const CUBLASLT_MATMUL_DESC_TRANSC: cublasLtMatmulDescAttributes_t = cublasLtMatmulDescAttributes_t(
+        5,
+    );
+}
+impl cublasLtMatmulDescAttributes_t {
+    /** Matrix fill mode, see cublasFillMode_t.
+
+ int32_t, default: CUBLAS_FILL_MODE_FULL*/
+    pub const CUBLASLT_MATMUL_DESC_FILL_MODE: cublasLtMatmulDescAttributes_t = cublasLtMatmulDescAttributes_t(
+        6,
+    );
+}
+impl cublasLtMatmulDescAttributes_t {
+    /** Epilogue function, see cublasLtEpilogue_t.
+
+ uint32_t, default: CUBLASLT_EPILOGUE_DEFAULT*/
+    pub const CUBLASLT_MATMUL_DESC_EPILOGUE: cublasLtMatmulDescAttributes_t = cublasLtMatmulDescAttributes_t(
+        7,
+    );
+}
+impl cublasLtMatmulDescAttributes_t {
+    /** Bias or bias gradient vector pointer in the device memory.
+
+ Bias case. See CUBLASLT_EPILOGUE_BIAS.
+ For bias data type see CUBLASLT_MATMUL_DESC_BIAS_DATA_TYPE.
+
+ Bias vector length must match matrix D rows count.
+
+ Bias gradient case. See CUBLASLT_EPILOGUE_DRELU_BGRAD and CUBLASLT_EPILOGUE_DGELU_BGRAD.
+ Bias gradient vector elements are the same type as the output elements
+ (Ctype) with the exception of IMMA kernels (see above).
+
+ Routines that don't dereference this pointer, like cublasLtMatmulAlgoGetHeuristic(),
+ depend on its value to determine expected pointer alignment.
+
+ Bias case: const void *, default: NULL
+ Bias gradient case: void *, default: NULL*/
+    pub const CUBLASLT_MATMUL_DESC_BIAS_POINTER: cublasLtMatmulDescAttributes_t = cublasLtMatmulDescAttributes_t(
+        8,
+    );
+}
+impl cublasLtMatmulDescAttributes_t {
+    /** Batch stride for bias or bias gradient vector.
+
+ Used together with CUBLASLT_MATMUL_DESC_BIAS_POINTER when matrix D's CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT > 1.
+
+ int64_t, default: 0*/
+    pub const CUBLASLT_MATMUL_DESC_BIAS_BATCH_STRIDE: cublasLtMatmulDescAttributes_t = cublasLtMatmulDescAttributes_t(
+        10,
+    );
+}
+impl cublasLtMatmulDescAttributes_t {
+    /** Pointer for epilogue auxiliary buffer.
+
+ - Output vector for ReLu bit-mask in forward pass when CUBLASLT_EPILOGUE_RELU_AUX
+ or CUBLASLT_EPILOGUE_RELU_AUX_BIAS epilogue is used.
+ - Input vector for ReLu bit-mask in backward pass when
+ CUBLASLT_EPILOGUE_DRELU_BGRAD epilogue is used.
+
+ - Output of GELU input matrix in forward pass when
+ CUBLASLT_EPILOGUE_GELU_AUX_BIAS epilogue is used.
+ - Input of GELU input matrix for backward pass when
+ CUBLASLT_EPILOGUE_DGELU_BGRAD epilogue is used.
+
+ For aux data type see CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_DATA_TYPE.
+
+ Routines that don't dereference this pointer, like cublasLtMatmulAlgoGetHeuristic(),
+ depend on its value to determine expected pointer alignment.
+
+ Requires setting CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD attribute.
+
+ Forward pass: void *, default: NULL
+ Backward pass: const void *, default: NULL*/
+    pub const CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER: cublasLtMatmulDescAttributes_t = cublasLtMatmulDescAttributes_t(
+        11,
+    );
+}
+impl cublasLtMatmulDescAttributes_t {
+    /** Leading dimension for epilogue auxiliary buffer.
+
+ - ReLu bit-mask matrix leading dimension in elements (i.e. bits)
+ when CUBLASLT_EPILOGUE_RELU_AUX, CUBLASLT_EPILOGUE_RELU_AUX_BIAS or CUBLASLT_EPILOGUE_DRELU_BGRAD epilogue is
+ used. Must be divisible by 128 and be no less than the number of rows in the output matrix.
+
+ - GELU input matrix leading dimension in elements
+ when CUBLASLT_EPILOGUE_GELU_AUX_BIAS or CUBLASLT_EPILOGUE_DGELU_BGRAD epilogue is used.
+ Must be divisible by 8 and be no less than the number of rows in the output matrix.
+
+ int64_t, default: 0*/
+    pub const CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD: cublasLtMatmulDescAttributes_t = cublasLtMatmulDescAttributes_t(
+        12,
+    );
+}
+impl cublasLtMatmulDescAttributes_t {
+    /** Batch stride for epilogue auxiliary buffer.
+
+ - ReLu bit-mask matrix batch stride in elements (i.e. bits)
+ when CUBLASLT_EPILOGUE_RELU_AUX, CUBLASLT_EPILOGUE_RELU_AUX_BIAS or CUBLASLT_EPILOGUE_DRELU_BGRAD epilogue is
+ used. Must be divisible by 128.
+
+ - GELU input matrix batch stride in elements
+ when CUBLASLT_EPILOGUE_GELU_AUX_BIAS or CUBLASLT_EPILOGUE_DGELU_BGRAD epilogue is used.
+ Must be divisible by 8.
+
+ int64_t, default: 0*/
+    pub const CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_BATCH_STRIDE: cublasLtMatmulDescAttributes_t = cublasLtMatmulDescAttributes_t(
+        13,
+    );
+}
+impl cublasLtMatmulDescAttributes_t {
+    /** Batch stride for alpha vector.
+
+ Used together with CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_HOST when matrix D's
+ CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT > 1. If CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_ZERO is set then
+ CUBLASLT_MATMUL_DESC_ALPHA_VECTOR_BATCH_STRIDE must be set to 0 as this mode doesn't support a batched alpha vector.
+ + int64_t, default: 0*/ + pub const CUBLASLT_MATMUL_DESC_ALPHA_VECTOR_BATCH_STRIDE: cublasLtMatmulDescAttributes_t = cublasLtMatmulDescAttributes_t( + 14, + ); +} +impl cublasLtMatmulDescAttributes_t { + /** Number of SMs to target for parallel execution. Optimizes heuristics for execution on a different number of SMs + when user expects a concurrent stream to be using some of the device resources. + + int32_t, default: 0 - use the number reported by the device.*/ + pub const CUBLASLT_MATMUL_DESC_SM_COUNT_TARGET: cublasLtMatmulDescAttributes_t = cublasLtMatmulDescAttributes_t( + 15, + ); +} +impl cublasLtMatmulDescAttributes_t { + /** Device pointer to the scale factor value that converts data in matrix A to the compute data type range. + + The scaling factor value must have the same type as the compute type. + + If not specified, or set to NULL, the scaling factor is assumed to be 1. + + If set for an unsupported matrix data, scale, and compute type combination, calling cublasLtMatmul() + will return CUBLAS_INVALID_VALUE. + + const void *, default: NULL*/ + pub const CUBLASLT_MATMUL_DESC_A_SCALE_POINTER: cublasLtMatmulDescAttributes_t = cublasLtMatmulDescAttributes_t( + 17, + ); +} +impl cublasLtMatmulDescAttributes_t { + /** Device pointer to the scale factor value to convert data in matrix B to compute data type range. + + The scaling factor value must have the same type as the compute type. + + If not specified, or set to NULL, the scaling factor is assumed to be 1. + + If set for an unsupported matrix data, scale, and compute type combination, calling cublasLtMatmul() + will return CUBLAS_INVALID_VALUE. + + const void *, default: NULL*/ + pub const CUBLASLT_MATMUL_DESC_B_SCALE_POINTER: cublasLtMatmulDescAttributes_t = cublasLtMatmulDescAttributes_t( + 18, + ); +} +impl cublasLtMatmulDescAttributes_t { + /** Device pointer to the scale factor value to convert data in matrix C to compute data type range. + + The scaling factor value must have the same type as the compute type. + + If not specified, or set to NULL, the scaling factor is assumed to be 1. + + If set for an unsupported matrix data, scale, and compute type combination, calling cublasLtMatmul() + will return CUBLAS_INVALID_VALUE. + + const void *, default: NULL*/ + pub const CUBLASLT_MATMUL_DESC_C_SCALE_POINTER: cublasLtMatmulDescAttributes_t = cublasLtMatmulDescAttributes_t( + 19, + ); +} +impl cublasLtMatmulDescAttributes_t { + /** Device pointer to the scale factor value to convert data in matrix D to compute data type range. + + The scaling factor value must have the same type as the compute type. + + If not specified, or set to NULL, the scaling factor is assumed to be 1. + + If set for an unsupported matrix data, scale, and compute type combination, calling cublasLtMatmul() + will return CUBLAS_INVALID_VALUE. + + const void *, default: NULL*/ + pub const CUBLASLT_MATMUL_DESC_D_SCALE_POINTER: cublasLtMatmulDescAttributes_t = cublasLtMatmulDescAttributes_t( + 20, + ); +} +impl cublasLtMatmulDescAttributes_t { + /** Device pointer to the memory location that on completion will be set to the maximum of absolute values in the + output matrix. + + The computed value has the same type as the compute type. + + If not specified or set to NULL, the maximum absolute value is not computed. If set for an unsupported matrix + data, scale, and compute type combination, calling cublasLtMatmul() will return CUBLAS_INVALID_VALUE. 
+ + void *, default: NULL*/ + pub const CUBLASLT_MATMUL_DESC_AMAX_D_POINTER: cublasLtMatmulDescAttributes_t = cublasLtMatmulDescAttributes_t( + 21, + ); +} +impl cublasLtMatmulDescAttributes_t { + /** Type of the data to be stored to the memory pointed to by CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER. + + If unset, the data type defaults to the type of elements of the output matrix with some exceptions, see details + below. + + ReLu uses a bit-mask. + + GELU input matrix elements type is the same as the type of elements of + the output matrix with some exceptions, see details below. + + For fp8 kernels with output type CUDA_R_8F_E4M3 the aux data type can be CUDA_R_8F_E4M3 or CUDA_R_16F with some + restrictions. See https://docs.nvidia.com/cuda/cublas/index.html#cublasLtMatmulDescAttributes_t for more details. + + If set for an unsupported matrix data, scale, and compute type combination, calling cublasLtMatmul() + will return CUBLAS_INVALID_VALUE. + + int32_t based on cudaDataType, default: -1*/ + pub const CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_DATA_TYPE: cublasLtMatmulDescAttributes_t = cublasLtMatmulDescAttributes_t( + 22, + ); +} +impl cublasLtMatmulDescAttributes_t { + /** Device pointer to the scaling factor value to convert results from compute type data range to storage + data range in the auxiliary matrix that is set via CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER. + + The scaling factor value must have the same type as the compute type. + + If not specified, or set to NULL, the scaling factor is assumed to be 1. If set for an unsupported matrix data, + scale, and compute type combination, calling cublasLtMatmul() will return CUBLAS_INVALID_VALUE. + + void *, default: NULL*/ + pub const CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_SCALE_POINTER: cublasLtMatmulDescAttributes_t = cublasLtMatmulDescAttributes_t( + 23, + ); +} +impl cublasLtMatmulDescAttributes_t { + /** Device pointer to the memory location that on completion will be set to the maximum of absolute values in the + buffer that is set via CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER. + + The computed value has the same type as the compute type. + + If not specified or set to NULL, the maximum absolute value is not computed. If set for an unsupported matrix + data, scale, and compute type combination, calling cublasLtMatmul() will return CUBLAS_INVALID_VALUE. + + void *, default: NULL*/ + pub const CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_AMAX_POINTER: cublasLtMatmulDescAttributes_t = cublasLtMatmulDescAttributes_t( + 24, + ); +} +impl cublasLtMatmulDescAttributes_t { + /** Flag for managing fp8 fast accumulation mode. + When enabled, problem execution might be faster but at the cost of lower accuracy because intermediate results + will not periodically be promoted to a higher precision. + + int8_t, default: 0 - fast accumulation mode is disabled.*/ + pub const CUBLASLT_MATMUL_DESC_FAST_ACCUM: cublasLtMatmulDescAttributes_t = cublasLtMatmulDescAttributes_t( + 25, + ); +} +impl cublasLtMatmulDescAttributes_t { + /** Type of bias or bias gradient vector in the device memory. + + Bias case: see CUBLASLT_EPILOGUE_BIAS. 
+ + Bias vector elements are the same type as the elements of output matrix (Dtype) with the following exceptions: + - IMMA kernels with computeType=CUDA_R_32I and Ctype=CUDA_R_8I where the bias vector elements + are the same type as alpha, beta (CUBLASLT_MATMUL_DESC_SCALE_TYPE=CUDA_R_32F) + - fp8 kernels with an output type of CUDA_R_32F, CUDA_R_8F_E4M3 or CUDA_R_8F_E5M2, See + https://docs.nvidia.com/cuda/cublas/index.html#cublasLtMatmul for details. + + int32_t based on cudaDataType, default: -1*/ + pub const CUBLASLT_MATMUL_DESC_BIAS_DATA_TYPE: cublasLtMatmulDescAttributes_t = cublasLtMatmulDescAttributes_t( + 26, + ); +} +impl cublasLtMatmulDescAttributes_t { + /** EXPERIMENTAL, DEPRECATED: Number of atomic synchronization chunks in the row dimension of the output matrix D. + + int32_t, default 0 (atomic synchronization disabled)*/ + pub const CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_NUM_CHUNKS_D_ROWS: cublasLtMatmulDescAttributes_t = cublasLtMatmulDescAttributes_t( + 27, + ); +} +impl cublasLtMatmulDescAttributes_t { + /** EXPERIMENTAL, DEPRECATED: Number of atomic synchronization chunks in the column dimension of the output matrix D. + + int32_t, default 0 (atomic synchronization disabled)*/ + pub const CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_NUM_CHUNKS_D_COLS: cublasLtMatmulDescAttributes_t = cublasLtMatmulDescAttributes_t( + 28, + ); +} +impl cublasLtMatmulDescAttributes_t { + /** EXPERIMENTAL: Pointer to a device array of input atomic counters consumed by a matmul. + + int32_t *, default: NULL*/ + pub const CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_IN_COUNTERS_POINTER: cublasLtMatmulDescAttributes_t = cublasLtMatmulDescAttributes_t( + 29, + ); +} +impl cublasLtMatmulDescAttributes_t { + /** EXPERIMENTAL: Pointer to a device array of output atomic counters produced by a matmul. + + int32_t *, default: NULL*/ + pub const CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_OUT_COUNTERS_POINTER: cublasLtMatmulDescAttributes_t = cublasLtMatmulDescAttributes_t( + 30, + ); +} +impl cublasLtMatmulDescAttributes_t { + /** Scaling mode that defines how the matrix scaling factor for matrix A is interpreted + + int32_t, default: 0*/ + pub const CUBLASLT_MATMUL_DESC_A_SCALE_MODE: cublasLtMatmulDescAttributes_t = cublasLtMatmulDescAttributes_t( + 31, + ); +} +impl cublasLtMatmulDescAttributes_t { + /** Scaling mode that defines how the matrix scaling factor for matrix B is interpreted + + int32_t, default: 0*/ + pub const CUBLASLT_MATMUL_DESC_B_SCALE_MODE: cublasLtMatmulDescAttributes_t = cublasLtMatmulDescAttributes_t( + 32, + ); +} +impl cublasLtMatmulDescAttributes_t { + /** Scaling mode that defines how the matrix scaling factor for matrix C is interpreted + + int32_t, default: 0*/ + pub const CUBLASLT_MATMUL_DESC_C_SCALE_MODE: cublasLtMatmulDescAttributes_t = cublasLtMatmulDescAttributes_t( + 33, + ); +} +impl cublasLtMatmulDescAttributes_t { + /** Scaling mode that defines how the matrix scaling factor for matrix D is interpreted + + int32_t, default: 0*/ + pub const CUBLASLT_MATMUL_DESC_D_SCALE_MODE: cublasLtMatmulDescAttributes_t = cublasLtMatmulDescAttributes_t( + 34, + ); +} +impl cublasLtMatmulDescAttributes_t { + /** Scaling mode that defines how the matrix scaling factor for the auxiliary matrix is interpreted + + int32_t, default: 0*/ + pub const CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_SCALE_MODE: cublasLtMatmulDescAttributes_t = cublasLtMatmulDescAttributes_t( + 35, + ); +} +impl cublasLtMatmulDescAttributes_t { + /** Device pointer to the scale factors that are used to convert data in matrix D to the compute data type range. 
+ + The scaling factor value type is defined by the scaling mode (see CUBLASLT_MATMUL_DESC_D_OUT_SCALE_MODE) + + If set for an unsupported matrix data, scale, scale mode, and compute type combination, calling cublasLtMatmul() + will return CUBLAS_INVALID_VALUE. + + void *, default: NULL*/ + pub const CUBLASLT_MATMUL_DESC_D_OUT_SCALE_POINTER: cublasLtMatmulDescAttributes_t = cublasLtMatmulDescAttributes_t( + 36, + ); +} +impl cublasLtMatmulDescAttributes_t { + /** Scaling mode that defines how the output matrix scaling factor for matrix D is interpreted + + int32_t, default: 0*/ + pub const CUBLASLT_MATMUL_DESC_D_OUT_SCALE_MODE: cublasLtMatmulDescAttributes_t = cublasLtMatmulDescAttributes_t( + 37, + ); +} +#[repr(transparent)] +/// Matmul descriptor attributes to define details of the operation. +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cublasLtMatmulDescAttributes_t(pub ::core::ffi::c_uint); +impl cublasLtMatrixTransformDescAttributes_t { + /** Scale type, see cudaDataType. Inputs are converted to scale type for scaling and summation and results are then + converted to output type to store in memory. + + int32_t*/ + pub const CUBLASLT_MATRIX_TRANSFORM_DESC_SCALE_TYPE: cublasLtMatrixTransformDescAttributes_t = cublasLtMatrixTransformDescAttributes_t( + 0, + ); +} +impl cublasLtMatrixTransformDescAttributes_t { + /** Pointer mode of alpha and beta, see cublasLtPointerMode_t. + + int32_t, default: CUBLASLT_POINTER_MODE_HOST*/ + pub const CUBLASLT_MATRIX_TRANSFORM_DESC_POINTER_MODE: cublasLtMatrixTransformDescAttributes_t = cublasLtMatrixTransformDescAttributes_t( + 1, + ); +} +impl cublasLtMatrixTransformDescAttributes_t { + /** Transform of matrix A, see cublasOperation_t. + + int32_t, default: CUBLAS_OP_N*/ + pub const CUBLASLT_MATRIX_TRANSFORM_DESC_TRANSA: cublasLtMatrixTransformDescAttributes_t = cublasLtMatrixTransformDescAttributes_t( + 2, + ); +} +impl cublasLtMatrixTransformDescAttributes_t { + /** Transform of matrix B, see cublasOperation_t. + + int32_t, default: CUBLAS_OP_N*/ + pub const CUBLASLT_MATRIX_TRANSFORM_DESC_TRANSB: cublasLtMatrixTransformDescAttributes_t = cublasLtMatrixTransformDescAttributes_t( + 3, + ); +} +#[repr(transparent)] +/// Matrix transform descriptor attributes to define details of the operation. +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cublasLtMatrixTransformDescAttributes_t(pub ::core::ffi::c_uint); +impl cublasLtReductionScheme_t { + /// No reduction scheme, dot-product shall be performed in one sequence. + pub const CUBLASLT_REDUCTION_SCHEME_NONE: cublasLtReductionScheme_t = cublasLtReductionScheme_t( + 0, + ); +} +impl cublasLtReductionScheme_t { + /** Reduction is performed "in place" - using the output buffer (and output data type) and counters (in workspace) to + guarantee the sequentiality.*/ + pub const CUBLASLT_REDUCTION_SCHEME_INPLACE: cublasLtReductionScheme_t = cublasLtReductionScheme_t( + 1, + ); +} +impl cublasLtReductionScheme_t { + /// Intermediate results are stored in compute type in the workspace and reduced in a separate step. + pub const CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE: cublasLtReductionScheme_t = cublasLtReductionScheme_t( + 2, + ); +} +impl cublasLtReductionScheme_t { + /// Intermediate results are stored in output type in the workspace and reduced in a separate step. 
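Because the reduction scheme values above are distinct bits (1, 2, 4), they compose directly into the masks consumed by CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK further down; a minimal sketch (illustrative only, not emitted by zluda_bindgen):

fn allow_inplace_and_compute_type() -> u32 {
    // 1 | 2 == 0x03, the example mask cited in the
    // CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK documentation below.
    cublasLtReductionScheme_t::CUBLASLT_REDUCTION_SCHEME_INPLACE.0
        | cublasLtReductionScheme_t::CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE.0
}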
+    pub const CUBLASLT_REDUCTION_SCHEME_OUTPUT_TYPE: cublasLtReductionScheme_t = cublasLtReductionScheme_t(
+        4,
+    );
+}
+impl cublasLtReductionScheme_t {
+    /// Mask covering all of the reduction scheme bits above.
+    pub const CUBLASLT_REDUCTION_SCHEME_MASK: cublasLtReductionScheme_t = cublasLtReductionScheme_t(
+        7,
+    );
+}
+#[repr(transparent)]
+/// Reduction scheme for portions of the dot-product calculated in parallel (a.k.a. "split-K").
+#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
+pub struct cublasLtReductionScheme_t(pub ::core::ffi::c_uint);
+impl cublasLtEpilogue_t {
+    /// No special postprocessing, just scale and quantize results if necessary.
+    pub const CUBLASLT_EPILOGUE_DEFAULT: cublasLtEpilogue_t = cublasLtEpilogue_t(1);
+}
+impl cublasLtEpilogue_t {
+    /// ReLu, apply ReLu point-wise transform to the results (x:=max(x, 0)).
+    pub const CUBLASLT_EPILOGUE_RELU: cublasLtEpilogue_t = cublasLtEpilogue_t(2);
+}
+impl cublasLtEpilogue_t {
+    /** ReLu, apply ReLu point-wise transform to the results (x:=max(x, 0)).
+
+ This epilogue mode produces an extra output, a ReLu bit-mask matrix,
+ see CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER.*/
+    pub const CUBLASLT_EPILOGUE_RELU_AUX: cublasLtEpilogue_t = cublasLtEpilogue_t(130);
+}
+impl cublasLtEpilogue_t {
+    /** Bias, apply (broadcasted) Bias from bias vector. Bias vector length must match matrix D rows, it must be packed
+ (stride between vector elements is 1). Bias vector is broadcasted to all columns and added before applying final
+ postprocessing.*/
+    pub const CUBLASLT_EPILOGUE_BIAS: cublasLtEpilogue_t = cublasLtEpilogue_t(4);
+}
+impl cublasLtEpilogue_t {
+    /// ReLu and Bias, apply Bias and then ReLu transform
+    pub const CUBLASLT_EPILOGUE_RELU_BIAS: cublasLtEpilogue_t = cublasLtEpilogue_t(6);
+}
+impl cublasLtEpilogue_t {
+    /** ReLu and Bias, apply Bias and then ReLu transform
+
+ This epilogue mode produces an extra output, a ReLu bit-mask matrix,
+ see CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER.*/
+    pub const CUBLASLT_EPILOGUE_RELU_AUX_BIAS: cublasLtEpilogue_t = cublasLtEpilogue_t(
+        134,
+    );
+}
+impl cublasLtEpilogue_t {
+    /** ReLu gradient, apply ReLu gradient to the matmul output.
+
+ This epilogue mode requires an extra input, the ReLu bit-mask matrix,
+ see CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER.*/
+    pub const CUBLASLT_EPILOGUE_DRELU: cublasLtEpilogue_t = cublasLtEpilogue_t(136);
+}
+impl cublasLtEpilogue_t {
+    /** ReLu and bias gradients, apply the ReLu gradient and compute the bias gradient
+ (see CUBLASLT_MATMUL_DESC_BIAS_POINTER).
+
+ This epilogue mode requires an extra input, the ReLu bit-mask matrix,
+ see CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER.*/
+    pub const CUBLASLT_EPILOGUE_DRELU_BGRAD: cublasLtEpilogue_t = cublasLtEpilogue_t(
+        152,
+    );
+}
+impl cublasLtEpilogue_t {
+    /// GELU, apply GELU point-wise transform to the results (x:=GELU(x)).
+    pub const CUBLASLT_EPILOGUE_GELU: cublasLtEpilogue_t = cublasLtEpilogue_t(32);
+}
+impl cublasLtEpilogue_t {
+    /** GELU, apply GELU point-wise transform to the results (x:=GELU(x)).
+
+ This epilogue mode outputs GELU input as a separate matrix (useful for training).
+ See CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER.*/
+    pub const CUBLASLT_EPILOGUE_GELU_AUX: cublasLtEpilogue_t = cublasLtEpilogue_t(160);
+}
+impl cublasLtEpilogue_t {
+    /// GELU and Bias, apply Bias and then GELU transform
+    pub const CUBLASLT_EPILOGUE_GELU_BIAS: cublasLtEpilogue_t = cublasLtEpilogue_t(36);
+}
+impl cublasLtEpilogue_t {
+    /** GELU and Bias, apply Bias and then GELU transform
+
+ This epilogue mode outputs GELU input as a separate matrix (useful for training).
+ See CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER.*/
+    pub const CUBLASLT_EPILOGUE_GELU_AUX_BIAS: cublasLtEpilogue_t = cublasLtEpilogue_t(
+        164,
+    );
+}
+impl cublasLtEpilogue_t {
+    /** GELU gradient, apply GELU gradient to the matmul output.
+
+ This epilogue mode requires an extra input, the GELU input matrix,
+ see CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER.*/
+    pub const CUBLASLT_EPILOGUE_DGELU: cublasLtEpilogue_t = cublasLtEpilogue_t(192);
+}
+impl cublasLtEpilogue_t {
+    /** GELU and bias gradients, apply the GELU gradient and compute the bias gradient
+ (see CUBLASLT_MATMUL_DESC_BIAS_POINTER).
+
+ This epilogue mode requires an extra input, the GELU input matrix,
+ see CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER.*/
+    pub const CUBLASLT_EPILOGUE_DGELU_BGRAD: cublasLtEpilogue_t = cublasLtEpilogue_t(
+        208,
+    );
+}
+impl cublasLtEpilogue_t {
+    /** Bias gradient based on the input matrix A.
+
+ The bias size corresponds to the number of rows of the matrix D.
+ The reduction happens over the GEMM's "k" dimension.
+
+ Stores Bias gradient in the auxiliary output
+ (see CUBLASLT_MATMUL_DESC_BIAS_POINTER).*/
+    pub const CUBLASLT_EPILOGUE_BGRADA: cublasLtEpilogue_t = cublasLtEpilogue_t(256);
+}
+impl cublasLtEpilogue_t {
+    /** Bias gradient based on the input matrix B.
+
+ The bias size corresponds to the number of columns of the matrix D.
+ The reduction happens over the GEMM's "k" dimension.
+
+ Stores Bias gradient in the auxiliary output
+ (see CUBLASLT_MATMUL_DESC_BIAS_POINTER).*/
+    pub const CUBLASLT_EPILOGUE_BGRADB: cublasLtEpilogue_t = cublasLtEpilogue_t(512);
+}
+#[repr(transparent)]
+/// Postprocessing options for the epilogue
+#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
+pub struct cublasLtEpilogue_t(pub ::core::ffi::c_uint);
+impl cublasLtMatmulSearch_t {
+    /// ask heuristics for the best algo for the given use case
+    pub const CUBLASLT_SEARCH_BEST_FIT: cublasLtMatmulSearch_t = cublasLtMatmulSearch_t(
+        0,
+    );
+}
+impl cublasLtMatmulSearch_t {
+    /// only try to find the best config for the preconfigured algo id
+    pub const CUBLASLT_SEARCH_LIMITED_BY_ALGO_ID: cublasLtMatmulSearch_t = cublasLtMatmulSearch_t(
+        1,
+    );
+}
+impl cublasLtMatmulSearch_t {
+    /// reserved for future use
+    pub const CUBLASLT_SEARCH_RESERVED_02: cublasLtMatmulSearch_t = cublasLtMatmulSearch_t(
+        2,
+    );
+}
+impl cublasLtMatmulSearch_t {
+    /// reserved for future use
+    pub const CUBLASLT_SEARCH_RESERVED_03: cublasLtMatmulSearch_t = cublasLtMatmulSearch_t(
+        3,
+    );
+}
+impl cublasLtMatmulSearch_t {
+    /// reserved for future use
+    pub const CUBLASLT_SEARCH_RESERVED_04: cublasLtMatmulSearch_t = cublasLtMatmulSearch_t(
+        4,
+    );
+}
+impl cublasLtMatmulSearch_t {
+    /// reserved for future use
+    pub const CUBLASLT_SEARCH_RESERVED_05: cublasLtMatmulSearch_t = cublasLtMatmulSearch_t(
+        5,
+    );
+}
+impl cublasLtMatmulSearch_t {
+    /// reserved for future use
+    pub const CUBLASLT_SEARCH_RESERVED_06: cublasLtMatmulSearch_t = cublasLtMatmulSearch_t(
+        6,
+    );
+}
+impl cublasLtMatmulSearch_t {
+    /// reserved for future use
+    pub const CUBLASLT_SEARCH_RESERVED_07: cublasLtMatmulSearch_t = cublasLtMatmulSearch_t(
+        7,
+    );
+}
+impl cublasLtMatmulSearch_t {
+    /// reserved for future use
+    pub const CUBLASLT_SEARCH_RESERVED_08: cublasLtMatmulSearch_t = cublasLtMatmulSearch_t(
+        8,
+    );
+}
+impl cublasLtMatmulSearch_t {
+    /// reserved for future use
+    pub const CUBLASLT_SEARCH_RESERVED_09: cublasLtMatmulSearch_t = cublasLtMatmulSearch_t(
+        9,
+    );
+}
+#[repr(transparent)]
+/// Matmul heuristic search mode
+#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
+pub struct cublasLtMatmulSearch_t(pub ::core::ffi::c_uint);
+impl cublasLtMatmulPreferenceAttributes_t {
+    /** Search mode, see cublasLtMatmulSearch_t.
+
+ uint32_t, default: CUBLASLT_SEARCH_BEST_FIT*/
+    pub const CUBLASLT_MATMUL_PREF_SEARCH_MODE: cublasLtMatmulPreferenceAttributes_t = cublasLtMatmulPreferenceAttributes_t(
+        0,
+    );
+}
+impl cublasLtMatmulPreferenceAttributes_t {
+    /** Maximum allowed workspace size in bytes.
+
+ uint64_t, default: 0 - no workspace allowed*/
+    pub const CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES: cublasLtMatmulPreferenceAttributes_t = cublasLtMatmulPreferenceAttributes_t(
+        1,
+    );
+}
+impl cublasLtMatmulPreferenceAttributes_t {
+    /** Reduction scheme mask, see cublasLtReductionScheme_t. Filters heuristic result to only include algo configs that
+ use one of the required modes.
+
+ E.g. mask value of 0x03 will allow only INPLACE and COMPUTE_TYPE reduction schemes.
+
+ uint32_t, default: CUBLASLT_REDUCTION_SCHEME_MASK (allows all reduction schemes)*/
+    pub const CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK: cublasLtMatmulPreferenceAttributes_t = cublasLtMatmulPreferenceAttributes_t(
+        3,
+    );
+}
+impl cublasLtMatmulPreferenceAttributes_t {
+    /** Minimum buffer alignment for matrix A (in bytes).
+
+ Selecting a smaller value will exclude algorithms that cannot work with matrix A that is not as strictly aligned
+ as they need.
+
+ uint32_t, default: 256*/
+    pub const CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_A_BYTES: cublasLtMatmulPreferenceAttributes_t = cublasLtMatmulPreferenceAttributes_t(
+        5,
+    );
+}
+impl cublasLtMatmulPreferenceAttributes_t {
+    /** Minimum buffer alignment for matrix B (in bytes).
+
+ Selecting a smaller value will exclude algorithms that cannot work with matrix B that is not as strictly aligned
+ as they need.
+
+ uint32_t, default: 256*/
+    pub const CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_B_BYTES: cublasLtMatmulPreferenceAttributes_t = cublasLtMatmulPreferenceAttributes_t(
+        6,
+    );
+}
+impl cublasLtMatmulPreferenceAttributes_t {
+    /** Minimum buffer alignment for matrix C (in bytes).
+
+ Selecting a smaller value will exclude algorithms that cannot work with matrix C that is not as strictly aligned
+ as they need.
+
+ uint32_t, default: 256*/
+    pub const CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_C_BYTES: cublasLtMatmulPreferenceAttributes_t = cublasLtMatmulPreferenceAttributes_t(
+        7,
+    );
+}
+impl cublasLtMatmulPreferenceAttributes_t {
+    /** Minimum buffer alignment for matrix D (in bytes).
+
+ Selecting a smaller value will exclude algorithms that cannot work with matrix D that is not as strictly aligned
+ as they need.
+
+ uint32_t, default: 256*/
+    pub const CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_D_BYTES: cublasLtMatmulPreferenceAttributes_t = cublasLtMatmulPreferenceAttributes_t(
+        8,
+    );
+}
+impl cublasLtMatmulPreferenceAttributes_t {
+    /** Maximum wave count.
+
+ See cublasLtMatmulHeuristicResult_t::wavesCount.
+
+ Selecting a non-zero value will exclude algorithms that report device utilization higher than specified.
+
+ float, default: 0.0f*/
+    pub const CUBLASLT_MATMUL_PREF_MAX_WAVES_COUNT: cublasLtMatmulPreferenceAttributes_t = cublasLtMatmulPreferenceAttributes_t(
+        9,
+    );
+}
+impl cublasLtMatmulPreferenceAttributes_t {
+    /** Numerical implementation details mask, see cublasLtNumericalImplFlags_t. Filters heuristic result to only include
+ algorithms that use the allowed implementations.
+
+ uint64_t, default: uint64_t(-1) (allow everything)*/
+    pub const CUBLASLT_MATMUL_PREF_IMPL_MASK: cublasLtMatmulPreferenceAttributes_t = cublasLtMatmulPreferenceAttributes_t(
+        12,
+    );
+}
+#[repr(transparent)]
+/// Algo search preference to fine tune the heuristic function.
+#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
+pub struct cublasLtMatmulPreferenceAttributes_t(pub ::core::ffi::c_uint);
+/** Results structure used by cublasLtMatmulAlgoGetHeuristic
+
+ Holds returned configured algo descriptor and its runtime properties.*/
+#[repr(C)]
+#[derive(Debug, Copy, Clone, PartialEq)]
+pub struct cublasLtMatmulHeuristicResult_t {
+    /** Matmul algorithm descriptor.
+
+ Must be initialized with cublasLtMatmulAlgoInit() if preferences' CUBLASLT_MATMUL_PREF_SEARCH_MODE is set to
+ CUBLASLT_SEARCH_LIMITED_BY_ALGO_ID*/
+    pub algo: cublasLtMatmulAlgo_t,
+    /// Actual size of workspace memory required.
+    pub workspaceSize: usize,
+    /** Result status; other fields are only valid if, after the call to cublasLtMatmulAlgoGetHeuristic(), this member
+ is set to CUBLAS_STATUS_SUCCESS.*/
+    pub state: cublasStatus_t,
+    /** Waves count - a device utilization metric.
+
+ wavesCount value of 1.0f suggests that when the kernel is launched it will fully occupy the GPU.*/
+    pub wavesCount: f32,
+    pub reserved: [::core::ffi::c_int; 4usize],
+}
+impl cublasLtMatmulAlgoCapAttributes_t {
+    /** support for split K, see CUBLASLT_ALGO_CONFIG_SPLITK_NUM
+
+ int32_t, 0 means no support, supported otherwise*/
+    pub const CUBLASLT_ALGO_CAP_SPLITK_SUPPORT: cublasLtMatmulAlgoCapAttributes_t = cublasLtMatmulAlgoCapAttributes_t(
+        0,
+    );
+}
+impl cublasLtMatmulAlgoCapAttributes_t {
+    /** reduction scheme mask, see cublasLtReductionScheme_t; shows supported reduction schemes, if reduction scheme is
+ not masked out it is supported.
+
+ e.g. int isReductionSchemeComputeTypeSupported = (reductionSchemeMask & CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE) ==
+ CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE ? 1 : 0;
+
+ uint32_t*/
+    pub const CUBLASLT_ALGO_CAP_REDUCTION_SCHEME_MASK: cublasLtMatmulAlgoCapAttributes_t = cublasLtMatmulAlgoCapAttributes_t(
+        1,
+    );
+}
+impl cublasLtMatmulAlgoCapAttributes_t {
+    /** support for cta swizzling, see CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING
+
+ uint32_t, 0 means no support, 1 means supported value of 1, other values are reserved*/
+    pub const CUBLASLT_ALGO_CAP_CTA_SWIZZLING_SUPPORT: cublasLtMatmulAlgoCapAttributes_t = cublasLtMatmulAlgoCapAttributes_t(
+        2,
+    );
+}
+impl cublasLtMatmulAlgoCapAttributes_t {
+    /** support strided batch
+
+ int32_t, 0 means no support, supported otherwise*/
+    pub const CUBLASLT_ALGO_CAP_STRIDED_BATCH_SUPPORT: cublasLtMatmulAlgoCapAttributes_t = cublasLtMatmulAlgoCapAttributes_t(
+        3,
+    );
+}
+impl cublasLtMatmulAlgoCapAttributes_t {
+    /** support results out of place (D != C in D = alpha.A.B + beta.C)
+
+ int32_t, 0 means no support, supported otherwise*/
+    pub const CUBLASLT_ALGO_CAP_OUT_OF_PLACE_RESULT_SUPPORT: cublasLtMatmulAlgoCapAttributes_t = cublasLtMatmulAlgoCapAttributes_t(
+        4,
+    );
+}
+impl cublasLtMatmulAlgoCapAttributes_t {
+    /** syrk/herk support (on top of regular gemm)
+
+ int32_t, 0 means no support, supported otherwise*/
+    pub const CUBLASLT_ALGO_CAP_UPLO_SUPPORT: cublasLtMatmulAlgoCapAttributes_t = cublasLtMatmulAlgoCapAttributes_t(
+        5,
+    );
+}
+impl cublasLtMatmulAlgoCapAttributes_t {
+    /** tile ids possible to use, see cublasLtMatmulTile_t; if no tile ids are supported use
+ CUBLASLT_MATMUL_TILE_UNDEFINED
+
+ use cublasLtMatmulAlgoCapGetAttribute() with sizeInBytes=0 to query actual count
+
+ array of uint32_t*/
+    pub const CUBLASLT_ALGO_CAP_TILE_IDS: cublasLtMatmulAlgoCapAttributes_t = cublasLtMatmulAlgoCapAttributes_t(
+        6,
+    );
+}
+impl cublasLtMatmulAlgoCapAttributes_t {
+    /** custom option range is from 0 to CUBLASLT_ALGO_CAP_CUSTOM_OPTION_MAX (inclusive), see
+ CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION
+
+ int32_t*/
+    pub const CUBLASLT_ALGO_CAP_CUSTOM_OPTION_MAX: cublasLtMatmulAlgoCapAttributes_t = cublasLtMatmulAlgoCapAttributes_t(
+        7,
+    );
+}
+impl cublasLtMatmulAlgoCapAttributes_t {
+    /** whether algorithm supports custom (not COL or ROW memory order), see cublasLtOrder_t
+
+ int32_t 0 means only COL and ROW memory order is allowed, non-zero means that algo might have different
+ requirements;*/
+    pub const CUBLASLT_ALGO_CAP_CUSTOM_MEMORY_ORDER: cublasLtMatmulAlgoCapAttributes_t = cublasLtMatmulAlgoCapAttributes_t(
+        10,
+    );
+}
+impl cublasLtMatmulAlgoCapAttributes_t {
+    /** bitmask enumerating pointer modes the algorithm supports
+
+ uint32_t, see cublasLtPointerModeMask_t*/
+    pub const CUBLASLT_ALGO_CAP_POINTER_MODE_MASK: cublasLtMatmulAlgoCapAttributes_t = cublasLtMatmulAlgoCapAttributes_t(
+        11,
+    );
+}
+impl cublasLtMatmulAlgoCapAttributes_t {
+    /** bitmask enumerating kinds of postprocessing the algorithm supports in the epilogue
+
+ uint32_t, see cublasLtEpilogue_t*/
+    pub const CUBLASLT_ALGO_CAP_EPILOGUE_MASK: cublasLtMatmulAlgoCapAttributes_t = cublasLtMatmulAlgoCapAttributes_t(
+        12,
+    );
+}
+impl cublasLtMatmulAlgoCapAttributes_t {
+    /** stages ids possible to use, see cublasLtMatmulStages_t; if no stages ids are supported use
+ CUBLASLT_MATMUL_STAGES_UNDEFINED
+
+ use cublasLtMatmulAlgoCapGetAttribute() with sizeInBytes=0 to query actual count
+
+ array of uint32_t*/
+    pub const CUBLASLT_ALGO_CAP_STAGES_IDS: cublasLtMatmulAlgoCapAttributes_t = cublasLtMatmulAlgoCapAttributes_t(
+        13,
+    );
+}
+impl cublasLtMatmulAlgoCapAttributes_t {
+    /** support for negative ld for all of the matrices
+
+ int32_t 0 means no support, supported otherwise*/
+    pub const CUBLASLT_ALGO_CAP_LD_NEGATIVE: cublasLtMatmulAlgoCapAttributes_t = cublasLtMatmulAlgoCapAttributes_t(
+        14,
+    );
+}
+impl cublasLtMatmulAlgoCapAttributes_t {
+    /** details about algorithm's implementation that affect its numerical behavior
+
+ uint64_t, see cublasLtNumericalImplFlags_t*/
+    pub const CUBLASLT_ALGO_CAP_NUMERICAL_IMPL_FLAGS: cublasLtMatmulAlgoCapAttributes_t = cublasLtMatmulAlgoCapAttributes_t(
+        15,
+    );
+}
+impl cublasLtMatmulAlgoCapAttributes_t {
+    /** minimum alignment required for A matrix in bytes
+ (required for buffer pointer, leading dimension, and possibly other strides defined for matrix memory order)
+
+ uint32_t*/
+    pub const CUBLASLT_ALGO_CAP_MIN_ALIGNMENT_A_BYTES: cublasLtMatmulAlgoCapAttributes_t = cublasLtMatmulAlgoCapAttributes_t(
+        16,
+    );
+}
+impl cublasLtMatmulAlgoCapAttributes_t {
+    /** minimum alignment required for B matrix in bytes
+ (required for buffer pointer, leading dimension, and possibly other strides defined for matrix memory order)
+
+ uint32_t*/
+    pub const CUBLASLT_ALGO_CAP_MIN_ALIGNMENT_B_BYTES: cublasLtMatmulAlgoCapAttributes_t = cublasLtMatmulAlgoCapAttributes_t(
+        17,
+    );
+}
+impl cublasLtMatmulAlgoCapAttributes_t {
+    /** minimum alignment required for C matrix in bytes
+ (required for buffer pointer, leading dimension, and possibly other strides defined for matrix memory order)
+
+ uint32_t*/
+    pub const CUBLASLT_ALGO_CAP_MIN_ALIGNMENT_C_BYTES: cublasLtMatmulAlgoCapAttributes_t = cublasLtMatmulAlgoCapAttributes_t(
+        18,
+    );
+}
+impl cublasLtMatmulAlgoCapAttributes_t {
+    /** minimum alignment required for D matrix in bytes
+ (required for buffer pointer, leading dimension, and possibly other strides defined for matrix memory order)
+
+ uint32_t*/
+    pub const CUBLASLT_ALGO_CAP_MIN_ALIGNMENT_D_BYTES: cublasLtMatmulAlgoCapAttributes_t = cublasLtMatmulAlgoCapAttributes_t(
+        19,
+    );
+}
+impl cublasLtMatmulAlgoCapAttributes_t {
+    /** EXPERIMENTAL: support for synchronization via atomic counters
+
+ int32_t*/
+    pub const CUBLASLT_ALGO_CAP_ATOMIC_SYNC: cublasLtMatmulAlgoCapAttributes_t = cublasLtMatmulAlgoCapAttributes_t(
+        20,
+    );
+}
+#[repr(transparent)]
+/// Capabilities Attributes that can be retrieved from an initialized Algo structure
+#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
+pub struct cublasLtMatmulAlgoCapAttributes_t(pub ::core::ffi::c_uint);
+impl cublasLtMatmulAlgoConfigAttributes_t {
+    /** algorithm index, see cublasLtMatmulAlgoGetIds()
+
+ readonly, set by cublasLtMatmulAlgoInit()
+ int32_t*/
+    pub const CUBLASLT_ALGO_CONFIG_ID: cublasLtMatmulAlgoConfigAttributes_t = cublasLtMatmulAlgoConfigAttributes_t(
+        0,
+    );
+}
+impl cublasLtMatmulAlgoConfigAttributes_t {
+ /** tile id, see cublasLtMatmulTile_t + + uint32_t, default: CUBLASLT_MATMUL_TILE_UNDEFINED*/ + pub const CUBLASLT_ALGO_CONFIG_TILE_ID: cublasLtMatmulAlgoConfigAttributes_t = cublasLtMatmulAlgoConfigAttributes_t( + 1, + ); +} +impl cublasLtMatmulAlgoConfigAttributes_t { + /** Number of K splits. If the number of K splits is greater than one, SPLITK_NUM parts + of matrix multiplication will be computed in parallel. The results will be accumulated + according to CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME + + int32_t, default: 1*/ + pub const CUBLASLT_ALGO_CONFIG_SPLITK_NUM: cublasLtMatmulAlgoConfigAttributes_t = cublasLtMatmulAlgoConfigAttributes_t( + 2, + ); +} +impl cublasLtMatmulAlgoConfigAttributes_t { + /** reduction scheme, see cublasLtReductionScheme_t + + uint32_t, default: CUBLASLT_REDUCTION_SCHEME_NONE*/ + pub const CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME: cublasLtMatmulAlgoConfigAttributes_t = cublasLtMatmulAlgoConfigAttributes_t( + 3, + ); +} +impl cublasLtMatmulAlgoConfigAttributes_t { + /** cta swizzling, change mapping from CUDA grid coordinates to parts of the matrices + + possible values: 0, 1, other values reserved + + uint32_t, default: 0*/ + pub const CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING: cublasLtMatmulAlgoConfigAttributes_t = cublasLtMatmulAlgoConfigAttributes_t( + 4, + ); +} +impl cublasLtMatmulAlgoConfigAttributes_t { + /** custom option, each algorithm can support some custom options that don't fit description of the other config + attributes, see CUBLASLT_ALGO_CAP_CUSTOM_OPTION_MAX to get accepted range for any specific case + + uint32_t, default: 0*/ + pub const CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION: cublasLtMatmulAlgoConfigAttributes_t = cublasLtMatmulAlgoConfigAttributes_t( + 5, + ); +} +impl cublasLtMatmulAlgoConfigAttributes_t { + /** stages id, see cublasLtMatmulStages_t + + uint32_t, default: CUBLASLT_MATMUL_STAGES_UNDEFINED*/ + pub const CUBLASLT_ALGO_CONFIG_STAGES_ID: cublasLtMatmulAlgoConfigAttributes_t = cublasLtMatmulAlgoConfigAttributes_t( + 6, + ); +} +impl cublasLtMatmulAlgoConfigAttributes_t { + /** inner shape id, see cublasLtMatmulInnerShape_t + + uint16_t, default: 0 (CUBLASLT_MATMUL_INNER_SHAPE_UNDEFINED)*/ + pub const CUBLASLT_ALGO_CONFIG_INNER_SHAPE_ID: cublasLtMatmulAlgoConfigAttributes_t = cublasLtMatmulAlgoConfigAttributes_t( + 7, + ); +} +impl cublasLtMatmulAlgoConfigAttributes_t { + /** Thread Block Cluster shape id, see cublasLtClusterShape_t. Defines cluster size to use. + + uint16_t, default: 0 (CUBLASLT_CLUSTER_SHAPE_AUTO)*/ + pub const CUBLASLT_ALGO_CONFIG_CLUSTER_SHAPE_ID: cublasLtMatmulAlgoConfigAttributes_t = cublasLtMatmulAlgoConfigAttributes_t( + 8, + ); +} +#[repr(transparent)] +/// Algo Configuration Attributes that can be set according to the Algo capabilities +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cublasLtMatmulAlgoConfigAttributes_t(pub ::core::ffi::c_uint); +/// Experimental: Logger callback type. 
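A minimal sketch of a Rust function with the ABI expected by the cublasLtLoggerCallback_t type declared just below; registering it would go through a cublasLtLoggerSetCallback binding, which is assumed here and not declared in this file:

unsafe extern "C" fn log_to_stderr(
    log_level: ::core::ffi::c_int,
    function_name: *const ::core::ffi::c_char,
    message: *const ::core::ffi::c_char,
) {
    // Both strings are NUL-terminated C strings owned by cuBLASLt; guard
    // against NULL before converting.
    if function_name.is_null() || message.is_null() {
        return;
    }
    let name = unsafe { ::core::ffi::CStr::from_ptr(function_name) };
    let text = unsafe { ::core::ffi::CStr::from_ptr(message) };
    eprintln!("[cublasLt:{}] {:?}: {:?}", log_level, name, text);
}

`Some(log_to_stderr)` then coerces to a value of the callback type.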
+pub type cublasLtLoggerCallback_t = ::core::option::Option< + unsafe extern "C" fn( + logLevel: ::core::ffi::c_int, + functionName: *const ::core::ffi::c_char, + message: *const ::core::ffi::c_char, + ), +>; diff --git a/cuda_types/src/cuda.rs b/cuda_types/src/cuda.rs index 2c2716a..4cc6cd6 100644 --- a/cuda_types/src/cuda.rs +++ b/cuda_types/src/cuda.rs @@ -1,7 +1,7 @@ // Generated automatically by zluda_bindgen // DO NOT EDIT MANUALLY #![allow(warnings)] -pub const CUDA_VERSION: u32 = 12040; +pub const CUDA_VERSION: u32 = 12080; pub const CU_IPC_HANDLE_SIZE: u32 = 64; pub const CU_COMPUTE_ACCELERATED_TARGET_BASE: u32 = 65536; pub const CU_GRAPH_COND_ASSIGN_DEFAULT: u32 = 1; @@ -23,6 +23,8 @@ pub const CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC: u32 = 2; pub const CUDA_NVSCISYNC_ATTR_SIGNAL: u32 = 1; pub const CUDA_NVSCISYNC_ATTR_WAIT: u32 = 2; pub const CU_MEM_CREATE_USAGE_TILE_POOL: u32 = 1; +pub const CU_MEM_CREATE_USAGE_HW_DECOMPRESS: u32 = 2; +pub const CU_MEM_POOL_CREATE_USAGE_HW_DECOMPRESS: u32 = 2; pub const CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_PRE_LAUNCH_SYNC: u32 = 1; pub const CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_POST_LAUNCH_SYNC: u32 = 2; pub const CUDA_ARRAY3D_LAYERED: u32 = 1; @@ -34,6 +36,7 @@ pub const CUDA_ARRAY3D_DEPTH_TEXTURE: u32 = 16; pub const CUDA_ARRAY3D_COLOR_ATTACHMENT: u32 = 32; pub const CUDA_ARRAY3D_SPARSE: u32 = 64; pub const CUDA_ARRAY3D_DEFERRED_MAPPING: u32 = 128; +pub const CUDA_ARRAY3D_VIDEO_ENCODE_DECODE: u32 = 256; pub const CU_TRSA_OVERRIDE_FORMAT: u32 = 1; pub const CU_TRSF_READ_AS_INTEGER: u32 = 1; pub const CU_TRSF_NORMALIZED_COORDINATES: u32 = 2; @@ -195,6 +198,15 @@ pub struct CUasyncCallbackEntry_st { } pub type CUasyncCallbackHandle = *mut CUasyncCallbackEntry_st; #[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct CUgreenCtx_st { + _unused: [u8; 0], +} +/** \typedef typedef struct CUgreenCtx_st* CUgreenCtx + A green context handle. This handle can be used safely from only one CPU thread at a time. 
+ Created via ::cuGreenCtxCreate*/ +pub type CUgreenCtx = *mut CUgreenCtx_st; +#[repr(C)] #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] pub struct CUuuid_st { pub bytes: [::core::ffi::c_uchar; 16usize], @@ -871,6 +883,79 @@ impl CUarray_format_enum { 158, ); } +impl CUarray_format_enum { + ///< 10-bit YUV planar format, with 4:2:0 sampling + pub const CU_AD_FORMAT_P010: CUarray_format_enum = CUarray_format_enum(159); +} +impl CUarray_format_enum { + ///< 16-bit YUV planar format, with 4:2:0 sampling + pub const CU_AD_FORMAT_P016: CUarray_format_enum = CUarray_format_enum(161); +} +impl CUarray_format_enum { + ///< 8-bit YUV planar format, with 4:2:2 sampling + pub const CU_AD_FORMAT_NV16: CUarray_format_enum = CUarray_format_enum(162); +} +impl CUarray_format_enum { + ///< 10-bit YUV planar format, with 4:2:2 sampling + pub const CU_AD_FORMAT_P210: CUarray_format_enum = CUarray_format_enum(163); +} +impl CUarray_format_enum { + ///< 16-bit YUV planar format, with 4:2:2 sampling + pub const CU_AD_FORMAT_P216: CUarray_format_enum = CUarray_format_enum(164); +} +impl CUarray_format_enum { + ///< 2 channel, 8-bit YUV packed planar format, with 4:2:2 sampling + pub const CU_AD_FORMAT_YUY2: CUarray_format_enum = CUarray_format_enum(165); +} +impl CUarray_format_enum { + ///< 2 channel, 10-bit YUV packed planar format, with 4:2:2 sampling + pub const CU_AD_FORMAT_Y210: CUarray_format_enum = CUarray_format_enum(166); +} +impl CUarray_format_enum { + ///< 2 channel, 16-bit YUV packed planar format, with 4:2:2 sampling + pub const CU_AD_FORMAT_Y216: CUarray_format_enum = CUarray_format_enum(167); +} +impl CUarray_format_enum { + ///< 4 channel, 8-bit YUV packed planar format, with 4:4:4 sampling + pub const CU_AD_FORMAT_AYUV: CUarray_format_enum = CUarray_format_enum(168); +} +impl CUarray_format_enum { + ///< 10-bit YUV packed planar format, with 4:4:4 sampling + pub const CU_AD_FORMAT_Y410: CUarray_format_enum = CUarray_format_enum(169); +} +impl CUarray_format_enum { + ///< 4 channel, 12-bit YUV packed planar format, with 4:4:4 sampling + pub const CU_AD_FORMAT_Y416: CUarray_format_enum = CUarray_format_enum(177); +} +impl CUarray_format_enum { + ///< 3 channel 8-bit YUV planar format, with 4:4:4 sampling + pub const CU_AD_FORMAT_Y444_PLANAR8: CUarray_format_enum = CUarray_format_enum(178); +} +impl CUarray_format_enum { + ///< 3 channel 10-bit YUV planar format, with 4:4:4 sampling + pub const CU_AD_FORMAT_Y444_PLANAR10: CUarray_format_enum = CUarray_format_enum(179); +} +impl CUarray_format_enum { + ///< 3 channel 8-bit YUV semi-planar format, with 4:4:4 sampling + pub const CU_AD_FORMAT_YUV444_8bit_SemiPlanar: CUarray_format_enum = CUarray_format_enum( + 180, + ); +} +impl CUarray_format_enum { + ///< 3 channel 16-bit YUV semi-planar format, with 4:4:4 sampling + pub const CU_AD_FORMAT_YUV444_16bit_SemiPlanar: CUarray_format_enum = CUarray_format_enum( + 181, + ); +} +impl CUarray_format_enum { + ///< 4 channel unorm R10G10B10A2 RGB format + pub const CU_AD_FORMAT_UNORM_INT_101010_2: CUarray_format_enum = CUarray_format_enum( + 80, + ); +} +impl CUarray_format_enum { + pub const CU_AD_FORMAT_MAX: CUarray_format_enum = CUarray_format_enum(2147483647); +} #[repr(transparent)] /// Array formats #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] @@ -1724,11 +1809,13 @@ impl CUdevice_attribute_enum { ); } impl CUdevice_attribute_enum { + ///< NUMA configuration of a device: value is of type ::CUdeviceNumaConfig enum pub const CU_DEVICE_ATTRIBUTE_NUMA_CONFIG: CUdevice_attribute_enum = 
CUdevice_attribute_enum(
         130,
     );
 }
 impl CUdevice_attribute_enum {
+    ///< NUMA node ID of the GPU memory
     pub const CU_DEVICE_ATTRIBUTE_NUMA_ID: CUdevice_attribute_enum = CUdevice_attribute_enum(
         131,
     );
@@ -1752,10 +1839,46 @@ impl CUdevice_attribute_enum {
     );
 }
 impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAX: CUdevice_attribute_enum = CUdevice_attribute_enum(
+    ///< Device supports CIG with D3D12.
+    pub const CU_DEVICE_ATTRIBUTE_D3D12_CIG_SUPPORTED: CUdevice_attribute_enum = CUdevice_attribute_enum(
         135,
     );
 }
+impl CUdevice_attribute_enum {
+    ///< The returned value shall be interpreted as a bitmask, where the individual bits are described by the ::CUmemDecompressAlgorithm enum.
+    pub const CU_DEVICE_ATTRIBUTE_MEM_DECOMPRESS_ALGORITHM_MASK: CUdevice_attribute_enum = CUdevice_attribute_enum(
+        136,
+    );
+}
+impl CUdevice_attribute_enum {
+    ///< The returned value is the maximum length in bytes of a single decompress operation that is allowed.
+    pub const CU_DEVICE_ATTRIBUTE_MEM_DECOMPRESS_MAXIMUM_LENGTH: CUdevice_attribute_enum = CUdevice_attribute_enum(
+        137,
+    );
+}
+impl CUdevice_attribute_enum {
+    ///< The combined 16-bit PCI device ID and 16-bit PCI vendor ID.
+    pub const CU_DEVICE_ATTRIBUTE_GPU_PCI_DEVICE_ID: CUdevice_attribute_enum = CUdevice_attribute_enum(
+        139,
+    );
+}
+impl CUdevice_attribute_enum {
+    ///< The combined 16-bit PCI subsystem ID and 16-bit PCI subsystem vendor ID.
+    pub const CU_DEVICE_ATTRIBUTE_GPU_PCI_SUBSYSTEM_ID: CUdevice_attribute_enum = CUdevice_attribute_enum(
+        140,
+    );
+}
+impl CUdevice_attribute_enum {
+    ///< Device supports HOST_NUMA location IPC between nodes in a multi-node system.
+    pub const CU_DEVICE_ATTRIBUTE_HOST_NUMA_MULTINODE_IPC_SUPPORTED: CUdevice_attribute_enum = CUdevice_attribute_enum(
+        143,
+    );
+}
+impl CUdevice_attribute_enum {
+    pub const CU_DEVICE_ATTRIBUTE_MAX: CUdevice_attribute_enum = CUdevice_attribute_enum(
+        144,
+    );
+}
 #[repr(transparent)]
 /// Device properties
 #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
@@ -1911,6 +2034,12 @@ impl CUpointer_attribute_enum {
         20,
     );
 }
+impl CUpointer_attribute_enum {
+    ///< Returns in \p *data a boolean that indicates whether the pointer points to memory that is capable of being used for hardware accelerated decompression.
+    pub const CU_POINTER_ATTRIBUTE_IS_HW_DECOMPRESS_CAPABLE: CUpointer_attribute_enum = CUpointer_attribute_enum(
+        21,
+    );
+}
 #[repr(transparent)]
 /// Pointer information
 #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
@@ -2706,9 +2835,30 @@ impl CUjit_target_enum {
     ///< Compute device class 9.0.
     pub const CU_TARGET_COMPUTE_90: CUjit_target_enum = CUjit_target_enum(90);
 }
+impl CUjit_target_enum {
+    ///< Compute device class 10.0.
+    pub const CU_TARGET_COMPUTE_100: CUjit_target_enum = CUjit_target_enum(100);
+}
+impl CUjit_target_enum {
+    ///< Compute device class 10.1.
+    pub const CU_TARGET_COMPUTE_101: CUjit_target_enum = CUjit_target_enum(101);
+}
+impl CUjit_target_enum {
+    ///< Compute device class 12.0.
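The accelerated "*A" targets added below encode the base compute class plus CU_COMPUTE_ACCELERATED_TARGET_BASE (65536, declared near the top of this file): 65636 = 65536 + 100, 65637 = 65536 + 101, 65656 = 65536 + 120. A compile-time sketch of that relationship (illustrative only, not emitted by zluda_bindgen):

const _: () = assert!(
    CUjit_target_enum::CU_TARGET_COMPUTE_100A.0
        == CU_COMPUTE_ACCELERATED_TARGET_BASE + CUjit_target_enum::CU_TARGET_COMPUTE_100.0
);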
+ pub const CU_TARGET_COMPUTE_120: CUjit_target_enum = CUjit_target_enum(120); +} impl CUjit_target_enum { pub const CU_TARGET_COMPUTE_90A: CUjit_target_enum = CUjit_target_enum(65626); } +impl CUjit_target_enum { + pub const CU_TARGET_COMPUTE_100A: CUjit_target_enum = CUjit_target_enum(65636); +} +impl CUjit_target_enum { + pub const CU_TARGET_COMPUTE_101A: CUjit_target_enum = CUjit_target_enum(65637); +} +impl CUjit_target_enum { + pub const CU_TARGET_COMPUTE_120A: CUjit_target_enum = CUjit_target_enum(65656); +} #[repr(transparent)] /// Online compilation targets #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] @@ -2923,7 +3073,19 @@ impl CUlimit_enum { pub const CU_LIMIT_PERSISTING_L2_CACHE_SIZE: CUlimit_enum = CUlimit_enum(6); } impl CUlimit_enum { - pub const CU_LIMIT_MAX: CUlimit_enum = CUlimit_enum(7); + ///< A maximum size in bytes of shared memory available to CUDA kernels on a CIG context. Can only be queried, cannot be set + pub const CU_LIMIT_SHMEM_SIZE: CUlimit_enum = CUlimit_enum(7); +} +impl CUlimit_enum { + ///< A non-zero value indicates this CUDA context is a CIG-enabled context. Can only be queried, cannot be set + pub const CU_LIMIT_CIG_ENABLED: CUlimit_enum = CUlimit_enum(8); +} +impl CUlimit_enum { + ///< When set to zero, CUDA will fail to launch a kernel on a CIG context, instead of using the fallback path, if the kernel uses more shared memory than available + pub const CU_LIMIT_CIG_SHMEM_FALLBACK_ENABLED: CUlimit_enum = CUlimit_enum(9); +} +impl CUlimit_enum { + pub const CU_LIMIT_MAX: CUlimit_enum = CUlimit_enum(10); } #[repr(transparent)] /// Limits @@ -3179,7 +3341,7 @@ pub struct CUDA_HOST_NODE_PARAMS_v2_st { /// Host node parameters pub type CUDA_HOST_NODE_PARAMS_v2 = CUDA_HOST_NODE_PARAMS_v2_st; impl CUgraphConditionalNodeType_enum { - ///< Conditional 'if' Node. Body executed once if condition value is non-zero. + ///< Conditional 'if/else' Node. Body[0] executed if condition is non-zero. If \p size == 2, an optional ELSE graph is created and this is executed if the condition is zero. pub const CU_GRAPH_COND_TYPE_IF: CUgraphConditionalNodeType_enum = CUgraphConditionalNodeType_enum( 0, ); @@ -3190,6 +3352,12 @@ impl CUgraphConditionalNodeType_enum { 1, ); } +impl CUgraphConditionalNodeType_enum { + ///< Conditional 'switch' Node. Body[n] is executed once, where 'n' is the value of the condition. If the condition does not match a body index, no body is launched. + pub const CU_GRAPH_COND_TYPE_SWITCH: CUgraphConditionalNodeType_enum = CUgraphConditionalNodeType_enum( + 2, + ); +} #[repr(transparent)] /// Conditional node types #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] @@ -3206,7 +3374,8 @@ using ::cuGraphConditionalHandleCreate.*/ pub handle: CUgraphConditionalHandle, ///< Type of conditional node. pub type_: CUgraphConditionalNodeType, - ///< Size of graph output array. Must be 1. + /**< Size of graph output array. Allowed values are 1 for CU_GRAPH_COND_TYPE_WHILE, 1 or 2 +for CU_GRAPH_COND_TYPE_IF, or any value greater than zero for CU_GRAPH_COND_TYPE_SWITCH.*/ pub size: ::core::ffi::c_uint, /**< CUDA-owned array populated with conditional node child graphs during creation of the node. Valid for the lifetime of the conditional node. @@ -3217,7 +3386,16 @@ memcopies, and conditionals. This applies recursively to child graphs and condit - All kernels, including kernels in nested conditionals or child graphs at any level, must belong to the same CUDA context. 
-These graphs may be populated using graph node creation APIs or ::cuStreamBeginCaptureToGraph.*/ +These graphs may be populated using graph node creation APIs or ::cuStreamBeginCaptureToGraph. + +CU_GRAPH_COND_TYPE_IF: +phGraph_out[0] is executed when the condition is non-zero. If \p size == 2, phGraph_out[1] will +be executed when the condition is zero. +CU_GRAPH_COND_TYPE_WHILE: +phGraph_out[0] is executed as long as the condition is non-zero. +CU_GRAPH_COND_TYPE_SWITCH: +phGraph_out[n] is executed when the condition is equal to n. If the condition >= \p size, +no body graph is executed.*/ pub phGraph_out: *mut CUgraph, ///< Context on which to run the node. Must match context used to create the handle and all body nodes. pub ctx: CUcontext, @@ -3405,6 +3583,12 @@ impl CUgraphInstantiateResult_enum { 4, ); } +impl CUgraphInstantiateResult_enum { + ///< One or more conditional handles are not associated with conditional nodes + pub const CUDA_GRAPH_INSTANTIATE_CONDITIONAL_HANDLE_UNUSED: CUgraphInstantiateResult_enum = CUgraphInstantiateResult_enum( + 5, + ); +} #[repr(transparent)] /// Graph instantiation results #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] @@ -3654,6 +3838,47 @@ impl CUlaunchAttributeID_enum { 10, ); } +impl CUlaunchAttributeID_enum { + /**< Valid for graph nodes, launches. Set +::CUlaunchAttributeValue::preferredClusterDim +to allow the kernel launch to specify a preferred substitute +cluster dimension. Blocks may be grouped according to either +the dimensions specified with this attribute (grouped into a +"preferred substitute cluster"), or the one specified with +::CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION attribute (grouped +into a "regular cluster"). The cluster dimensions of a +"preferred substitute cluster" shall be an integer multiple +greater than zero of the regular cluster dimensions. The +device will attempt - on a best-effort basis - to group +thread blocks into preferred clusters over grouping them +into regular clusters. When it deems necessary (primarily +when the device temporarily runs out of physical resources +to launch the larger preferred clusters), the device may +switch to launch the regular clusters instead to attempt to +utilize as much of the physical device resources as possible. +
      +Each type of cluster will have its enumeration / coordinate +setup as if the grid consists solely of its type of cluster. +For example, if the preferred substitute cluster dimensions +double the regular cluster dimensions, there might be +simultaneously a regular cluster indexed at (1,0,0), and a +preferred cluster indexed at (1,0,0). In this example, the +preferred substitute cluster (1,0,0) replaces regular +clusters (2,0,0) and (3,0,0) and groups their blocks. +
      +This attribute will only take effect when a regular cluster +dimension has been specified. The preferred substitute +cluster dimension must be an integer multiple greater than +zero of the regular cluster dimension and must divide the +grid. It must also be no more than `maxBlocksPerCluster`, if +it is set in the kernel's `__launch_bounds__`. Otherwise it +must be less than the maximum value the driver can support. +Otherwise, setting this attribute to a value physically +unable to fit on any particular device is permitted.*/ + pub const CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION: CUlaunchAttributeID_enum = CUlaunchAttributeID_enum( + 11, + ); +} impl CUlaunchAttributeID_enum { /**< Valid for launches. Set ::CUlaunchAttributeValue::launchCompletionEvent to record the @@ -3715,10 +3940,22 @@ device-updatable nodes, the graph must be uploaded before it is launched again.* ); } impl CUlaunchAttributeID_enum { - pub const CU_LAUNCH_ATTRIBUTE_MAX: CUlaunchAttributeID_enum = CUlaunchAttributeID_enum( + /**< Valid for launches. On devices where the L1 cache and shared memory use the +same hardware resources, setting ::CUlaunchAttributeValue::sharedMemCarveout to a +percentage between 0-100 signals the CUDA driver to set the shared memory carveout +preference, in percent of the total shared memory for that kernel launch. +This attribute takes precedence over ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT. +This is only a hint, and the CUDA driver can choose a different configuration if +required for the launch.*/ + pub const CU_LAUNCH_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: CUlaunchAttributeID_enum = CUlaunchAttributeID_enum( 14, ); } +impl CUlaunchAttributeID_enum { + pub const CU_LAUNCH_ATTRIBUTE_MAX: CUlaunchAttributeID_enum = CUlaunchAttributeID_enum( + 15, + ); +} #[repr(transparent)] /// Launch attributes enum; used as id field of ::CUlaunchAttribute #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] @@ -3747,9 +3984,7 @@ scheduling policy preference for the kernel.*/ /**< Value of launch attribute ::CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION.*/ pub programmaticStreamSerializationAllowed: ::core::ffi::c_int, - ///< Value of launch attribute ::CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT. pub programmaticEvent: CUlaunchAttributeValue_union__bindgen_ty_2, - ///< Value of launch attribute ::CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT. pub launchCompletionEvent: CUlaunchAttributeValue_union__bindgen_ty_3, ///< Value of launch attribute ::CU_LAUNCH_ATTRIBUTE_PRIORITY. Execution priority of the kernel. pub priority: ::core::ffi::c_int, @@ -3760,8 +3995,10 @@ scheduling policy preference for the kernel.*/ /**< Value of launch attribute ::CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN. See::CUlaunchMemSyncDomain*/ pub memSyncDomain: CUlaunchMemSyncDomain, - ///< Value of launch attribute ::CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE. - pub deviceUpdatableKernelNode: CUlaunchAttributeValue_union__bindgen_ty_4, + pub preferredClusterDim: CUlaunchAttributeValue_union__bindgen_ty_4, + pub deviceUpdatableKernelNode: CUlaunchAttributeValue_union__bindgen_ty_5, + ///< Value of launch attribute ::CU_LAUNCH_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT. + pub sharedMemCarveout: ::core::ffi::c_uint, } /** Value of launch attribute ::CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION that represents the desired cluster dimensions for the kernel. 
Opaque type @@ -3779,31 +4016,54 @@ pub struct CUlaunchAttributeValue_union__bindgen_ty_1 { pub y: ::core::ffi::c_uint, pub z: ::core::ffi::c_uint, } +/** Value of launch attribute ::CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT + with the following fields: + - \p CUevent event - Event to fire when all blocks trigger it. + - \p int flags - Event record flags, see ::cuEventRecordWithFlags. Does not accept ::CU_EVENT_RECORD_EXTERNAL. + - \p triggerAtBlockStart - If this is set to non-0, each block launch will automatically trigger the event.*/ #[repr(C)] #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] pub struct CUlaunchAttributeValue_union__bindgen_ty_2 { - ///< Event to fire when all blocks trigger it pub event: CUevent, - /**< Event record flags, see ::cuEventRecordWithFlags. Does not accept -::CU_EVENT_RECORD_EXTERNAL.*/ pub flags: ::core::ffi::c_int, - ///< If this is set to non-0, each block launch will automatically trigger the event pub triggerAtBlockStart: ::core::ffi::c_int, } +/** Value of launch attribute ::CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT + with the following fields: + - \p CUevent event - Event to fire when the last block launches + - \p int flags - Event record flags, see ::cuEventRecordWithFlags. Does not accept ::CU_EVENT_RECORD_EXTERNAL.*/ #[repr(C)] #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] pub struct CUlaunchAttributeValue_union__bindgen_ty_3 { - ///< Event to fire when the last block launches pub event: CUevent, - ///< Event record flags, see ::cuEventRecordWithFlags. Does not accept ::CU_EVENT_RECORD_EXTERNAL. pub flags: ::core::ffi::c_int, } +/** Value of launch attribute ::CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION + that represents the desired preferred cluster dimensions for the kernel. + Opaque type with the following fields: + - \p x - The X dimension of the preferred cluster, in blocks. Must + be a divisor of the grid X dimension, and must be a + multiple of the \p x field of ::CUlaunchAttributeValue::clusterDim. + - \p y - The Y dimension of the preferred cluster, in blocks. Must + be a divisor of the grid Y dimension, and must be a + multiple of the \p y field of ::CUlaunchAttributeValue::clusterDim. + - \p z - The Z dimension of the preferred cluster, in blocks. Must be + equal to the \p z field of ::CUlaunchAttributeValue::clusterDim.*/ #[repr(C)] #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] pub struct CUlaunchAttributeValue_union__bindgen_ty_4 { - ///< Whether or not the resulting kernel node should be device-updatable. + pub x: ::core::ffi::c_uint, + pub y: ::core::ffi::c_uint, + pub z: ::core::ffi::c_uint, +} +/** Value of launch attribute ::CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE + with the following fields: + - \p int deviceUpdatable - Whether or not the resulting kernel node should be device-updatable. + - \p CUgraphDeviceNode devNode - Returns a handle to pass to the various device-side update functions.*/ +#[repr(C)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct CUlaunchAttributeValue_union__bindgen_ty_5 { pub deviceUpdatable: ::core::ffi::c_int, - ///< Returns a handle to pass to the various device-side update functions.
pub devNode: CUgraphDeviceNode, } /// Launch attributes union; used as value field of ::CUlaunchAttribute @@ -3999,6 +4259,38 @@ pub union CUexecAffinityParam_st__bindgen_ty_1 { pub type CUexecAffinityParam_v1 = CUexecAffinityParam_st; /// Execution Affinity Parameters pub type CUexecAffinityParam = CUexecAffinityParam_v1; +impl CUcigDataType_enum { + pub const CIG_DATA_TYPE_D3D12_COMMAND_QUEUE: CUcigDataType_enum = CUcigDataType_enum( + 1, + ); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct CUcigDataType_enum(pub ::core::ffi::c_uint); +pub use self::CUcigDataType_enum as CUcigDataType; +/// CIG Context Create Params +#[repr(C)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct CUctxCigParam_st { + pub sharedDataType: CUcigDataType, + pub sharedData: *mut ::core::ffi::c_void, +} +/// CIG Context Create Params +pub type CUctxCigParam = CUctxCigParam_st; +/** Params for creating CUDA context + Exactly one of execAffinityParams and cigParams + must be non-NULL.*/ +#[repr(C)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct CUctxCreateParams_st { + pub execAffinityParams: *mut CUexecAffinityParam, + pub numExecAffinityParams: ::core::ffi::c_int, + pub cigParams: *mut CUctxCigParam, +} +/** Params for creating CUDA context + Exactly one of execAffinityParams and cigParams + must be non-NULL.*/ +pub type CUctxCreateParams = CUctxCreateParams_st; impl CUlibraryOption_enum { pub const CU_LIBRARY_HOST_UNIVERSAL_FUNCTION_AND_DATA_TABLE: CUlibraryOption_enum = CUlibraryOption_enum( 0, @@ -4763,6 +5055,21 @@ impl CUtensorMapDataType_enum { 12, ); } +impl CUtensorMapDataType_enum { + pub const CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B: CUtensorMapDataType_enum = CUtensorMapDataType_enum( + 13, + ); +} +impl CUtensorMapDataType_enum { + pub const CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B: CUtensorMapDataType_enum = CUtensorMapDataType_enum( + 14, + ); +} +impl CUtensorMapDataType_enum { + pub const CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B: CUtensorMapDataType_enum = CUtensorMapDataType_enum( + 15, + ); +} #[repr(transparent)] /// Tensor map data type #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] @@ -4810,6 +5117,21 @@ impl CUtensorMapSwizzle_enum { 3, ); } +impl CUtensorMapSwizzle_enum { + pub const CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B: CUtensorMapSwizzle_enum = CUtensorMapSwizzle_enum( + 4, + ); +} +impl CUtensorMapSwizzle_enum { + pub const CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B_FLIP_8B: CUtensorMapSwizzle_enum = CUtensorMapSwizzle_enum( + 5, + ); +} +impl CUtensorMapSwizzle_enum { + pub const CU_TENSOR_MAP_SWIZZLE_128B_ATOM_64B: CUtensorMapSwizzle_enum = CUtensorMapSwizzle_enum( + 6, + ); +} #[repr(transparent)] /// Tensor map swizzling mode of shared memory banks #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] @@ -4858,6 +5180,22 @@ impl CUtensorMapFloatOOBfill_enum { pub struct CUtensorMapFloatOOBfill_enum(pub ::core::ffi::c_uint); /// Tensor map out-of-bounds fill type pub use self::CUtensorMapFloatOOBfill_enum as CUtensorMapFloatOOBfill; +impl CUtensorMapIm2ColWideMode_enum { + pub const CU_TENSOR_MAP_IM2COL_WIDE_MODE_W: CUtensorMapIm2ColWideMode_enum = CUtensorMapIm2ColWideMode_enum( + 0, + ); +} +impl CUtensorMapIm2ColWideMode_enum { + pub const CU_TENSOR_MAP_IM2COL_WIDE_MODE_W128: CUtensorMapIm2ColWideMode_enum = CUtensorMapIm2ColWideMode_enum( + 1, + ); +} +#[repr(transparent)] +/// Tensor map Im2Col wide mode +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct CUtensorMapIm2ColWideMode_enum(pub ::core::ffi::c_uint); 
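+// Editor's note: the sketch below is illustrative and not generator output.
+// Every enum in this file is emitted as a #[repr(transparent)] newtype over
+// c_uint with associated constants rather than a Rust `enum`, so that values
+// introduced by future CUDA releases can cross the FFI boundary without
+// undefined behavior. Because the newtypes derive PartialEq/Eq, callers can
+// still match on the known constants and keep a fallback arm for anything
+// unrecognized. `swizzle_name` is a hypothetical helper that would normally
+// live in a hand-written consumer crate; it is shown here only to document
+// the pattern, using the swizzle constants declared above:
+fn swizzle_name(s: CUtensorMapSwizzle_enum) -> &'static str {
+    match s {
+        // Constants added in this hunk (discriminants 4 and 6).
+        CUtensorMapSwizzle_enum::CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B => "128B atom, 32B",
+        CUtensorMapSwizzle_enum::CU_TENSOR_MAP_SWIZZLE_128B_ATOM_64B => "128B atom, 64B",
+        // Any other value, including ones from drivers newer than these bindings.
+        _ => "unrecognized swizzle mode",
+    }
+}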
+/// Tensor map Im2Col wide mode +pub use self::CUtensorMapIm2ColWideMode_enum as CUtensorMapIm2ColWideMode; /// GPU Direct v3 tokens #[repr(C)] #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] @@ -5505,6 +5843,18 @@ impl CUmemRangeHandleType_enum { pub struct CUmemRangeHandleType_enum(pub ::core::ffi::c_uint); /// Specifies the handle type for address range pub use self::CUmemRangeHandleType_enum as CUmemRangeHandleType; +impl CUmemRangeFlags_enum { + ///< Indicates that DMA_BUF handle should be mapped via PCIe BAR1 + pub const CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE: CUmemRangeFlags_enum = CUmemRangeFlags_enum( + 1, + ); +} +#[repr(transparent)] +/// Flag for requesting handle type for address range. +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct CUmemRangeFlags_enum(pub ::core::ffi::c_uint); +/// Flag for requesting handle type for address range. +pub use self::CUmemRangeFlags_enum as CUmemRangeFlags; impl CUarraySparseSubresourceType_enum { pub const CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL: CUarraySparseSubresourceType_enum = CUarraySparseSubresourceType_enum( 0, @@ -5911,8 +6261,10 @@ pub struct CUmemPoolProps_st { pub win32SecurityAttributes: *mut ::core::ffi::c_void, ///< Maximum pool size. When set to 0, defaults to a system dependent value. pub maxSize: usize, + ///< Bitmask indicating intended usage for the pool. + pub usage: ::core::ffi::c_ushort, ///< reserved for future use, must be 0 - pub reserved: [::core::ffi::c_uchar; 56usize], + pub reserved: [::core::ffi::c_uchar; 54usize], } /// Specifies the properties of allocations made from the pool. pub type CUmemPoolProps_v1 = CUmemPoolProps_st; @@ -6330,9 +6682,258 @@ impl CUdeviceNumaConfig_enum { ); } #[repr(transparent)] +/// CUDA device NUMA configuration #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] pub struct CUdeviceNumaConfig_enum(pub ::core::ffi::c_uint); +/// CUDA device NUMA configuration pub use self::CUdeviceNumaConfig_enum as CUdeviceNumaConfig; +impl CUprocessState_enum { + ///< Default process state + pub const CU_PROCESS_STATE_RUNNING: CUprocessState_enum = CUprocessState_enum(0); +} +impl CUprocessState_enum { + ///< CUDA API locks are taken so further CUDA API calls will block + pub const CU_PROCESS_STATE_LOCKED: CUprocessState_enum = CUprocessState_enum(1); +} +impl CUprocessState_enum { + ///< Application memory contents have been checkpointed and underlying allocations and device handles have been released + pub const CU_PROCESS_STATE_CHECKPOINTED: CUprocessState_enum = CUprocessState_enum( + 2, + ); +} +impl CUprocessState_enum { + ///< Application entered an uncorrectable error during the checkpoint/restore process + pub const CU_PROCESS_STATE_FAILED: CUprocessState_enum = CUprocessState_enum(3); +} +#[repr(transparent)] +/// CUDA Process States +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct CUprocessState_enum(pub ::core::ffi::c_uint); +/// CUDA Process States +pub use self::CUprocessState_enum as CUprocessState; +/// CUDA checkpoint optional lock arguments +#[repr(C)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct CUcheckpointLockArgs_st { + ///< Timeout in milliseconds to attempt to lock the process, 0 indicates no timeout + pub timeoutMs: ::core::ffi::c_uint, + ///< Reserved for future use, must be zero + pub reserved0: ::core::ffi::c_uint, + ///< Reserved for future use, must be zeroed + pub reserved1: [cuuint64_t; 7usize], +} +/// CUDA checkpoint optional lock arguments +pub type CUcheckpointLockArgs = CUcheckpointLockArgs_st; +/// 
CUDA checkpoint optional checkpoint arguments +#[repr(C)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct CUcheckpointCheckpointArgs_st { + ///< Reserved for future use, must be zeroed + pub reserved: [cuuint64_t; 8usize], +} +/// CUDA checkpoint optional checkpoint arguments +pub type CUcheckpointCheckpointArgs = CUcheckpointCheckpointArgs_st; +/// CUDA checkpoint optional restore arguments +#[repr(C)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct CUcheckpointRestoreArgs_st { + ///< Reserved for future use, must be zeroed + pub reserved: [cuuint64_t; 8usize], +} +/// CUDA checkpoint optional restore arguments +pub type CUcheckpointRestoreArgs = CUcheckpointRestoreArgs_st; +/// CUDA checkpoint optional unlock arguments +#[repr(C)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct CUcheckpointUnlockArgs_st { + ///< Reserved for future use, must be zeroed + pub reserved: [cuuint64_t; 8usize], +} +/// CUDA checkpoint optional unlock arguments +pub type CUcheckpointUnlockArgs = CUcheckpointUnlockArgs_st; +impl CUmemcpyFlags_enum { + pub const CU_MEMCPY_FLAG_DEFAULT: CUmemcpyFlags_enum = CUmemcpyFlags_enum(0); +} +impl CUmemcpyFlags_enum { + /// Hint to the driver to try and overlap the copy with compute work on the SMs. + pub const CU_MEMCPY_FLAG_PREFER_OVERLAP_WITH_COMPUTE: CUmemcpyFlags_enum = CUmemcpyFlags_enum( + 1, + ); +} +#[repr(transparent)] +/// Flags to specify for copies within a batch. For more details see ::cuMemcpyBatchAsync. +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct CUmemcpyFlags_enum(pub ::core::ffi::c_uint); +/// Flags to specify for copies within a batch. For more details see ::cuMemcpyBatchAsync. +pub use self::CUmemcpyFlags_enum as CUmemcpyFlags; +impl CUmemcpySrcAccessOrder_enum { + /// Default invalid. + pub const CU_MEMCPY_SRC_ACCESS_ORDER_INVALID: CUmemcpySrcAccessOrder_enum = CUmemcpySrcAccessOrder_enum( + 0, + ); +} +impl CUmemcpySrcAccessOrder_enum { + /// Indicates that access to the source pointer must be in stream order. + pub const CU_MEMCPY_SRC_ACCESS_ORDER_STREAM: CUmemcpySrcAccessOrder_enum = CUmemcpySrcAccessOrder_enum( + 1, + ); +} +impl CUmemcpySrcAccessOrder_enum { + /** Indicates that access to the source pointer can be out of stream order and + all accesses must be complete before the API call returns. This flag is suited for + ephemeral sources (ex., stack variables) when it's known that no prior operations + in the stream can be accessing the memory and also that the lifetime of the memory + is limited to the scope that the source variable was declared in. Specifying + this flag allows the driver to optimize the copy and removes the need for the user + to synchronize the stream after the API call.*/ + pub const CU_MEMCPY_SRC_ACCESS_ORDER_DURING_API_CALL: CUmemcpySrcAccessOrder_enum = CUmemcpySrcAccessOrder_enum( + 2, + ); +} +impl CUmemcpySrcAccessOrder_enum { + /** Indicates that access to the source pointer can be out of stream order and the accesses + can happen even after the API call returns. This flag is suited for host pointers + allocated outside CUDA (ex., via malloc) when it's known that no prior operations + in the stream can be accessing the memory. 
Specifying this flag allows the driver + to optimize the copy on certain platforms.*/ + pub const CU_MEMCPY_SRC_ACCESS_ORDER_ANY: CUmemcpySrcAccessOrder_enum = CUmemcpySrcAccessOrder_enum( + 3, + ); +} +impl CUmemcpySrcAccessOrder_enum { + /** Indicates that access to the source pointer can be out of stream order and the accesses + can happen even after the API call returns. This flag is suited for host pointers + allocated outside CUDA (ex., via malloc) when it's known that no prior operations + in the stream can be accessing the memory. Specifying this flag allows the driver + to optimize the copy on certain platforms.*/ + pub const CU_MEMCPY_SRC_ACCESS_ORDER_MAX: CUmemcpySrcAccessOrder_enum = CUmemcpySrcAccessOrder_enum( + 2147483647, + ); +} +#[repr(transparent)] +/** These flags allow applications to convey the source access ordering CUDA must maintain. + The destination will always be accessed in stream order.*/ +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct CUmemcpySrcAccessOrder_enum(pub ::core::ffi::c_uint); +/** These flags allow applications to convey the source access ordering CUDA must maintain. + The destination will always be accessed in stream order.*/ +pub use self::CUmemcpySrcAccessOrder_enum as CUmemcpySrcAccessOrder; +/// Attributes specific to copies within a batch. For more details on usage see ::cuMemcpyBatchAsync. +#[repr(C)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct CUmemcpyAttributes_st { + ///< Source access ordering to be observed for copies with this attribute. + pub srcAccessOrder: CUmemcpySrcAccessOrder, + ///< Hint location for the source operand. Ignored when the pointers are not managed memory or memory allocated outside CUDA. + pub srcLocHint: CUmemLocation, + ///< Hint location for the destination operand. Ignored when the pointers are not managed memory or memory allocated outside CUDA. + pub dstLocHint: CUmemLocation, + ///< Additional flags for copies with this attribute. See ::CUmemcpyFlags + pub flags: ::core::ffi::c_uint, +} +/// Attributes specific to copies within a batch. For more details on usage see ::cuMemcpyBatchAsync. +pub type CUmemcpyAttributes_v1 = CUmemcpyAttributes_st; +/// Attributes specific to copies within a batch. For more details on usage see ::cuMemcpyBatchAsync. +pub type CUmemcpyAttributes = CUmemcpyAttributes_v1; +impl CUmemcpy3DOperandType_enum { + ///< Memcpy operand is a valid pointer. + pub const CU_MEMCPY_OPERAND_TYPE_POINTER: CUmemcpy3DOperandType_enum = CUmemcpy3DOperandType_enum( + 1, + ); +} +impl CUmemcpy3DOperandType_enum { + ///< Memcpy operand is a CUarray. + pub const CU_MEMCPY_OPERAND_TYPE_ARRAY: CUmemcpy3DOperandType_enum = CUmemcpy3DOperandType_enum( + 2, + ); +} +impl CUmemcpy3DOperandType_enum { + pub const CU_MEMCPY_OPERAND_TYPE_MAX: CUmemcpy3DOperandType_enum = CUmemcpy3DOperandType_enum( + 2147483647, + ); +} +#[repr(transparent)] +/// These flags allow applications to convey the operand type for individual copies specified in ::cuMemcpy3DBatchAsync. +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct CUmemcpy3DOperandType_enum(pub ::core::ffi::c_uint); +/// These flags allow applications to convey the operand type for individual copies specified in ::cuMemcpy3DBatchAsync. 
+pub use self::CUmemcpy3DOperandType_enum as CUmemcpy3DOperandType; +/// Struct representing offset into a CUarray in elements +#[repr(C)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct CUoffset3D_st { + pub x: usize, + pub y: usize, + pub z: usize, +} +/// Struct representing offset into a CUarray in elements +pub type CUoffset3D_v1 = CUoffset3D_st; +/// Struct representing offset into a CUarray in elements +pub type CUoffset3D = CUoffset3D_v1; +/// Struct representing width/height/depth of a CUarray in elements +#[repr(C)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct CUextent3D_st { + pub width: usize, + pub height: usize, + pub depth: usize, +} +/// Struct representing width/height/depth of a CUarray in elements +pub type CUextent3D_v1 = CUextent3D_st; +/// Struct representing width/height/depth of a CUarray in elements +pub type CUextent3D = CUextent3D_v1; +/// Struct representing an operand for copy with ::cuMemcpy3DBatchAsync +#[repr(C)] +#[derive(Copy, Clone)] +pub struct CUmemcpy3DOperand_st { + pub type_: CUmemcpy3DOperandType, + pub op: CUmemcpy3DOperand_st__bindgen_ty_1, +} +#[repr(C)] +#[derive(Copy, Clone)] +pub union CUmemcpy3DOperand_st__bindgen_ty_1 { + pub ptr: CUmemcpy3DOperand_st__bindgen_ty_1__bindgen_ty_1, + pub array: CUmemcpy3DOperand_st__bindgen_ty_1__bindgen_ty_2, +} +/// Struct representing an operand when ::CUmemcpy3DOperand::type is ::CU_MEMCPY_OPERAND_TYPE_POINTER +#[repr(C)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct CUmemcpy3DOperand_st__bindgen_ty_1__bindgen_ty_1 { + pub ptr: CUdeviceptr, + ///< Length of each row in elements. + pub rowLength: usize, + ///< Height of each layer in elements. + pub layerHeight: usize, + ///< Hint location for the operand. Ignored when the pointers are not managed memory or memory allocated outside CUDA. + pub locHint: CUmemLocation, +} +/// Struct representing an operand when ::CUmemcpy3DOperand::type is ::CU_MEMCPY_OPERAND_TYPE_ARRAY +#[repr(C)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct CUmemcpy3DOperand_st__bindgen_ty_1__bindgen_ty_2 { + pub array: CUarray, + pub offset: CUoffset3D, +} +/// Struct representing an operand for copy with ::cuMemcpy3DBatchAsync +pub type CUmemcpy3DOperand_v1 = CUmemcpy3DOperand_st; +/// Struct representing an operand for copy with ::cuMemcpy3DBatchAsync +pub type CUmemcpy3DOperand = CUmemcpy3DOperand_v1; +#[repr(C)] +#[derive(Copy, Clone)] +pub struct CUDA_MEMCPY3D_BATCH_OP_st { + ///< Source memcpy operand. + pub src: CUmemcpy3DOperand, + ///< Destination memcpy operand. + pub dst: CUmemcpy3DOperand, + ///< Extents of the memcpy between src and dst. The width, height and depth components must not be 0. + pub extent: CUextent3D, + ///< Source access ordering to be observed for copy from src to dst. + pub srcAccessOrder: CUmemcpySrcAccessOrder, + ///< Additional flags for copies with this attribute. See ::CUmemcpyFlags + pub flags: ::core::ffi::c_uint, +} +pub type CUDA_MEMCPY3D_BATCH_OP_v1 = CUDA_MEMCPY3D_BATCH_OP_st; +pub type CUDA_MEMCPY3D_BATCH_OP = CUDA_MEMCPY3D_BATCH_OP_v1; impl CUmoduleLoadingMode_enum { ///< Lazy Kernel Loading is not enabled pub const CU_MODULE_EAGER_LOADING: CUmoduleLoadingMode_enum = CUmoduleLoadingMode_enum( @@ -6351,6 +6952,61 @@ impl CUmoduleLoadingMode_enum { pub struct CUmoduleLoadingMode_enum(pub ::core::ffi::c_uint); /// CUDA Lazy Loading status pub use self::CUmoduleLoadingMode_enum as CUmoduleLoadingMode; +impl CUmemDecompressAlgorithm_enum { + ///< Decompression is unsupported. 
+ pub const CU_MEM_DECOMPRESS_UNSUPPORTED: CUmemDecompressAlgorithm_enum = CUmemDecompressAlgorithm_enum( + 0, + ); +} +impl CUmemDecompressAlgorithm_enum { + ///< Deflate is supported. + pub const CU_MEM_DECOMPRESS_ALGORITHM_DEFLATE: CUmemDecompressAlgorithm_enum = CUmemDecompressAlgorithm_enum( + 1, + ); +} +impl CUmemDecompressAlgorithm_enum { + ///< Snappy is supported. + pub const CU_MEM_DECOMPRESS_ALGORITHM_SNAPPY: CUmemDecompressAlgorithm_enum = CUmemDecompressAlgorithm_enum( + 2, + ); +} +#[repr(transparent)] +/// \brief Bitmasks for CU_DEVICE_ATTRIBUTE_MEM_DECOMPRESS_ALGORITHM_MASK. +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct CUmemDecompressAlgorithm_enum(pub ::core::ffi::c_uint); +/// \brief Bitmasks for CU_DEVICE_ATTRIBUTE_MEM_DECOMPRESS_ALGORITHM_MASK. +pub use self::CUmemDecompressAlgorithm_enum as CUmemDecompressAlgorithm; +/** \brief Structure describing the parameters that compose a single + decompression operation.*/ +#[repr(C)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct CUmemDecompressParams_st { + /** The number of bytes to be read and decompressed from + ::CUmemDecompressParams_st.src.*/ + pub srcNumBytes: usize, + /** The number of bytes that the decompression operation will be expected to + write to ::CUmemDecompressParams_st.dst. This value is optional; if + present, it may be used by the CUDA driver as a heuristic for scheduling + the individual decompression operations.*/ + pub dstNumBytes: usize, + /** After the decompression operation has completed, the actual number of + bytes written to ::CUmemDecompressParams.dst will be recorded as a 32-bit + unsigned integer in the memory at this address.*/ + pub dstActBytes: *mut cuuint32_t, + /** Pointer to a buffer of at least ::CUmemDecompressParams_st.srcNumBytes + compressed bytes.*/ + pub src: *const ::core::ffi::c_void, + /** Pointer to a buffer where the decompressed data will be written. The + number of bytes written to this location will be recorded in the memory + pointed to by ::CUmemDecompressParams_st.dstActBytes*/ + pub dst: *mut ::core::ffi::c_void, + /// The decompression algorithm to use. 
+ pub algo: CUmemDecompressAlgorithm, + pub padding: [::core::ffi::c_uchar; 20usize], +} +/** \brief Structure describing the parameters that compose a single + decompression operation.*/ +pub type CUmemDecompressParams = CUmemDecompressParams_st; impl CUfunctionLoadingState_enum { pub const CU_FUNCTION_LOADING_STATE_UNLOADED: CUfunctionLoadingState_enum = CUfunctionLoadingState_enum( 0, @@ -6397,7 +7053,12 @@ impl CUcoredumpSettings_enum { pub const CU_COREDUMP_PIPE: CUcoredumpSettings_enum = CUcoredumpSettings_enum(6); } impl CUcoredumpSettings_enum { - pub const CU_COREDUMP_MAX: CUcoredumpSettings_enum = CUcoredumpSettings_enum(7); + pub const CU_COREDUMP_GENERATION_FLAGS: CUcoredumpSettings_enum = CUcoredumpSettings_enum( + 7, + ); +} +impl CUcoredumpSettings_enum { + pub const CU_COREDUMP_MAX: CUcoredumpSettings_enum = CUcoredumpSettings_enum(8); } #[repr(transparent)] /// Flags for choosing a coredump attribute to get/set @@ -6405,15 +7066,50 @@ impl CUcoredumpSettings_enum { pub struct CUcoredumpSettings_enum(pub ::core::ffi::c_uint); /// Flags for choosing a coredump attribute to get/set pub use self::CUcoredumpSettings_enum as CUcoredumpSettings; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct CUgreenCtx_st { - _unused: [u8; 0], +impl CUCoredumpGenerationFlags { + pub const CU_COREDUMP_DEFAULT_FLAGS: CUCoredumpGenerationFlags = CUCoredumpGenerationFlags( + 0, + ); } -/** \typedef typedef struct CUgreenCtx_st* CUgreenCtx - A green context handle. This handle can be used safely from only one CPU thread at a time. - Created via ::cuGreenCtxCreate*/ -pub type CUgreenCtx = *mut CUgreenCtx_st; +impl CUCoredumpGenerationFlags { + pub const CU_COREDUMP_SKIP_NONRELOCATED_ELF_IMAGES: CUCoredumpGenerationFlags = CUCoredumpGenerationFlags( + 1, + ); +} +impl CUCoredumpGenerationFlags { + pub const CU_COREDUMP_SKIP_GLOBAL_MEMORY: CUCoredumpGenerationFlags = CUCoredumpGenerationFlags( + 2, + ); +} +impl CUCoredumpGenerationFlags { + pub const CU_COREDUMP_SKIP_SHARED_MEMORY: CUCoredumpGenerationFlags = CUCoredumpGenerationFlags( + 4, + ); +} +impl CUCoredumpGenerationFlags { + pub const CU_COREDUMP_SKIP_LOCAL_MEMORY: CUCoredumpGenerationFlags = CUCoredumpGenerationFlags( + 8, + ); +} +impl CUCoredumpGenerationFlags { + pub const CU_COREDUMP_SKIP_ABORT: CUCoredumpGenerationFlags = CUCoredumpGenerationFlags( + 16, + ); +} +impl CUCoredumpGenerationFlags { + pub const CU_COREDUMP_SKIP_CONSTBANK_MEMORY: CUCoredumpGenerationFlags = CUCoredumpGenerationFlags( + 32, + ); +} +impl CUCoredumpGenerationFlags { + pub const CU_COREDUMP_LIGHTWEIGHT_FLAGS: CUCoredumpGenerationFlags = CUCoredumpGenerationFlags( + 47, + ); +} +#[repr(transparent)] +/// Flags for controlling coredump contents +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct CUCoredumpGenerationFlags(pub ::core::ffi::c_uint); #[repr(C)] #[derive(Debug, Copy, Clone)] pub struct CUdevResourceDesc_st { @@ -6432,6 +7128,19 @@ impl CUgreenCtxCreate_flags { #[repr(transparent)] #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] pub struct CUgreenCtxCreate_flags(pub ::core::ffi::c_uint); +impl CUdevSmResourceSplit_flags { + pub const CU_DEV_SM_RESOURCE_SPLIT_IGNORE_SM_COSCHEDULING: CUdevSmResourceSplit_flags = CUdevSmResourceSplit_flags( + 1, + ); +} +impl CUdevSmResourceSplit_flags { + pub const CU_DEV_SM_RESOURCE_SPLIT_MAX_POTENTIAL_CLUSTER_SIZE: CUdevSmResourceSplit_flags = CUdevSmResourceSplit_flags( + 2, + ); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct CUdevSmResourceSplit_flags(pub 
::core::ffi::c_uint); impl CUdevResourceType { pub const CU_DEV_RESOURCE_TYPE_INVALID: CUdevResourceType = CUdevResourceType(0); } @@ -7432,10 +8141,28 @@ impl CUeglColorFormat_enum { ); } impl CUeglColorFormat_enum { - pub const CU_EGL_COLOR_FORMAT_MAX: CUeglColorFormat_enum = CUeglColorFormat_enum( + ///< Y, U, V in one surface, interleaved as UYVY in one channel. + pub const CU_EGL_COLOR_FORMAT_UYVY_709: CUeglColorFormat_enum = CUeglColorFormat_enum( 114, ); } +impl CUeglColorFormat_enum { + ///< Extended Range Y, U, V in one surface, interleaved as UYVY in one channel. + pub const CU_EGL_COLOR_FORMAT_UYVY_709_ER: CUeglColorFormat_enum = CUeglColorFormat_enum( + 115, + ); +} +impl CUeglColorFormat_enum { + ///< Y, U, V in one surface, interleaved as UYVY in one channel. + pub const CU_EGL_COLOR_FORMAT_UYVY_2020: CUeglColorFormat_enum = CUeglColorFormat_enum( + 116, + ); +} +impl CUeglColorFormat_enum { + pub const CU_EGL_COLOR_FORMAT_MAX: CUeglColorFormat_enum = CUeglColorFormat_enum( + 117, + ); +} #[repr(transparent)] /** CUDA EGL Color Format - The different planar and multiplanar formats currently supported for CUDA_EGL interops. Three channel formats are currently not supported for ::CU_EGL_FRAME_TYPE_ARRAY*/ @@ -7587,6 +8314,145 @@ pub type VdpGetProcAddress = ::core::option::Option< function_pointer: *mut *mut ::core::ffi::c_void, ) -> VdpStatus, >; +impl cudaDataType_t { + pub const CUDA_R_16F: cudaDataType_t = cudaDataType_t(2); +} +impl cudaDataType_t { + pub const CUDA_C_16F: cudaDataType_t = cudaDataType_t(6); +} +impl cudaDataType_t { + pub const CUDA_R_16BF: cudaDataType_t = cudaDataType_t(14); +} +impl cudaDataType_t { + pub const CUDA_C_16BF: cudaDataType_t = cudaDataType_t(15); +} +impl cudaDataType_t { + pub const CUDA_R_32F: cudaDataType_t = cudaDataType_t(0); +} +impl cudaDataType_t { + pub const CUDA_C_32F: cudaDataType_t = cudaDataType_t(4); +} +impl cudaDataType_t { + pub const CUDA_R_64F: cudaDataType_t = cudaDataType_t(1); +} +impl cudaDataType_t { + pub const CUDA_C_64F: cudaDataType_t = cudaDataType_t(5); +} +impl cudaDataType_t { + pub const CUDA_R_4I: cudaDataType_t = cudaDataType_t(16); +} +impl cudaDataType_t { + pub const CUDA_C_4I: cudaDataType_t = cudaDataType_t(17); +} +impl cudaDataType_t { + pub const CUDA_R_4U: cudaDataType_t = cudaDataType_t(18); +} +impl cudaDataType_t { + pub const CUDA_C_4U: cudaDataType_t = cudaDataType_t(19); +} +impl cudaDataType_t { + pub const CUDA_R_8I: cudaDataType_t = cudaDataType_t(3); +} +impl cudaDataType_t { + pub const CUDA_C_8I: cudaDataType_t = cudaDataType_t(7); +} +impl cudaDataType_t { + pub const CUDA_R_8U: cudaDataType_t = cudaDataType_t(8); +} +impl cudaDataType_t { + pub const CUDA_C_8U: cudaDataType_t = cudaDataType_t(9); +} +impl cudaDataType_t { + pub const CUDA_R_16I: cudaDataType_t = cudaDataType_t(20); +} +impl cudaDataType_t { + pub const CUDA_C_16I: cudaDataType_t = cudaDataType_t(21); +} +impl cudaDataType_t { + pub const CUDA_R_16U: cudaDataType_t = cudaDataType_t(22); +} +impl cudaDataType_t { + pub const CUDA_C_16U: cudaDataType_t = cudaDataType_t(23); +} +impl cudaDataType_t { + pub const CUDA_R_32I: cudaDataType_t = cudaDataType_t(10); +} +impl cudaDataType_t { + pub const CUDA_C_32I: cudaDataType_t = cudaDataType_t(11); +} +impl cudaDataType_t { + pub const CUDA_R_32U: cudaDataType_t = cudaDataType_t(12); +} +impl cudaDataType_t { + pub const CUDA_C_32U: cudaDataType_t = cudaDataType_t(13); +} +impl cudaDataType_t { + pub const CUDA_R_64I: cudaDataType_t = cudaDataType_t(24); +} +impl 
cudaDataType_t { + pub const CUDA_C_64I: cudaDataType_t = cudaDataType_t(25); +} +impl cudaDataType_t { + pub const CUDA_R_64U: cudaDataType_t = cudaDataType_t(26); +} +impl cudaDataType_t { + pub const CUDA_C_64U: cudaDataType_t = cudaDataType_t(27); +} +impl cudaDataType_t { + pub const CUDA_R_8F_E4M3: cudaDataType_t = cudaDataType_t(28); +} +impl cudaDataType_t { + pub const CUDA_R_8F_UE4M3: cudaDataType_t = cudaDataType_t(28); +} +impl cudaDataType_t { + pub const CUDA_R_8F_E5M2: cudaDataType_t = cudaDataType_t(29); +} +impl cudaDataType_t { + pub const CUDA_R_8F_UE8M0: cudaDataType_t = cudaDataType_t(30); +} +impl cudaDataType_t { + pub const CUDA_R_6F_E2M3: cudaDataType_t = cudaDataType_t(31); +} +impl cudaDataType_t { + pub const CUDA_R_6F_E3M2: cudaDataType_t = cudaDataType_t(32); +} +impl cudaDataType_t { + pub const CUDA_R_4F_E2M1: cudaDataType_t = cudaDataType_t(33); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudaDataType_t(pub ::core::ffi::c_uint); +pub use self::cudaDataType_t as cudaDataType; +impl libraryPropertyType_t { + pub const MAJOR_VERSION: libraryPropertyType_t = libraryPropertyType_t(0); +} +impl libraryPropertyType_t { + pub const MINOR_VERSION: libraryPropertyType_t = libraryPropertyType_t(1); +} +impl libraryPropertyType_t { + pub const PATCH_LEVEL: libraryPropertyType_t = libraryPropertyType_t(2); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct libraryPropertyType_t(pub ::core::ffi::c_uint); +pub use self::libraryPropertyType_t as libraryPropertyType; +#[repr(C)] +#[repr(align(8))] +#[derive(Debug, Copy, Clone, PartialEq)] +pub struct float2 { + pub x: f32, + pub y: f32, +} +#[repr(C)] +#[repr(align(16))] +#[derive(Debug, Copy, Clone, PartialEq)] +pub struct double2 { + pub x: f64, + pub y: f64, +} +pub type cuFloatComplex = float2; +pub type cuDoubleComplex = double2; +pub type cuComplex = cuFloatComplex; impl CUerror { pub const INVALID_VALUE: CUerror = CUerror(unsafe { ::core::num::NonZeroU32::new_unchecked(1) @@ -7699,6 +8565,9 @@ impl CUerror { pub const UNSUPPORTED_DEVSIDE_SYNC: CUerror = CUerror(unsafe { ::core::num::NonZeroU32::new_unchecked(225) }); + pub const CONTAINED: CUerror = CUerror(unsafe { + ::core::num::NonZeroU32::new_unchecked(226) + }); pub const INVALID_SOURCE: CUerror = CUerror(unsafe { ::core::num::NonZeroU32::new_unchecked(300) }); @@ -7786,6 +8655,9 @@ impl CUerror { pub const COOPERATIVE_LAUNCH_TOO_LARGE: CUerror = CUerror(unsafe { ::core::num::NonZeroU32::new_unchecked(720) }); + pub const TENSOR_MEMORY_LEAK: CUerror = CUerror(unsafe { + ::core::num::NonZeroU32::new_unchecked(721) + }); pub const NOT_PERMITTED: CUerror = CUerror(unsafe { ::core::num::NonZeroU32::new_unchecked(800) }); @@ -7873,6 +8745,9 @@ impl CUerror { pub const INVALID_RESOURCE_CONFIGURATION: CUerror = CUerror(unsafe { ::core::num::NonZeroU32::new_unchecked(915) }); + pub const KEY_ROTATION: CUerror = CUerror(unsafe { + ::core::num::NonZeroU32::new_unchecked(916) + }); pub const UNKNOWN: CUerror = CUerror(unsafe { ::core::num::NonZeroU32::new_unchecked(999) }); @@ -7953,6 +8828,7 @@ pub trait CUresultConsts { const ERROR_UNSUPPORTED_DEVSIDE_SYNC: CUresult = CUresult::Err( CUerror::UNSUPPORTED_DEVSIDE_SYNC, ); + const ERROR_CONTAINED: CUresult = CUresult::Err(CUerror::CONTAINED); const ERROR_INVALID_SOURCE: CUresult = CUresult::Err(CUerror::INVALID_SOURCE); const ERROR_FILE_NOT_FOUND: CUresult = CUresult::Err(CUerror::FILE_NOT_FOUND); const ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: 
CUresult = CUresult::Err( @@ -8012,6 +8888,9 @@ pub trait CUresultConsts { const ERROR_COOPERATIVE_LAUNCH_TOO_LARGE: CUresult = CUresult::Err( CUerror::COOPERATIVE_LAUNCH_TOO_LARGE, ); + const ERROR_TENSOR_MEMORY_LEAK: CUresult = CUresult::Err( + CUerror::TENSOR_MEMORY_LEAK, + ); const ERROR_NOT_PERMITTED: CUresult = CUresult::Err(CUerror::NOT_PERMITTED); const ERROR_NOT_SUPPORTED: CUresult = CUresult::Err(CUerror::NOT_SUPPORTED); const ERROR_SYSTEM_NOT_READY: CUresult = CUresult::Err(CUerror::SYSTEM_NOT_READY); @@ -8083,6 +8962,7 @@ pub trait CUresultConsts { const ERROR_INVALID_RESOURCE_CONFIGURATION: CUresult = CUresult::Err( CUerror::INVALID_RESOURCE_CONFIGURATION, ); + const ERROR_KEY_ROTATION: CUresult = CUresult::Err(CUerror::KEY_ROTATION); const ERROR_UNKNOWN: CUresult = CUresult::Err(CUerror::UNKNOWN); } impl CUresultConsts for CUresult {} diff --git a/cuda_types/src/cudnn.rs b/cuda_types/src/cudnn.rs new file mode 100644 index 0000000..7133443 --- /dev/null +++ b/cuda_types/src/cudnn.rs @@ -0,0 +1,1478 @@ +// Generated automatically by zluda_bindgen +// DO NOT EDIT MANUALLY +#![allow(warnings)] +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct cudnnContext { + _unused: [u8; 0], +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct cudnnRuntimeTag_t { + _unused: [u8; 0], +} +impl cudnnErrQueryMode_t { + pub const CUDNN_ERRQUERY_RAWCODE: cudnnErrQueryMode_t = cudnnErrQueryMode_t(0); +} +impl cudnnErrQueryMode_t { + pub const CUDNN_ERRQUERY_NONBLOCKING: cudnnErrQueryMode_t = cudnnErrQueryMode_t(1); +} +impl cudnnErrQueryMode_t { + pub const CUDNN_ERRQUERY_BLOCKING: cudnnErrQueryMode_t = cudnnErrQueryMode_t(2); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnErrQueryMode_t(pub ::core::ffi::c_uint); +impl cudnnMathType_t { + pub const CUDNN_DEFAULT_MATH: cudnnMathType_t = cudnnMathType_t(0); +} +impl cudnnMathType_t { + pub const CUDNN_TENSOR_OP_MATH: cudnnMathType_t = cudnnMathType_t(1); +} +impl cudnnMathType_t { + pub const CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION: cudnnMathType_t = cudnnMathType_t( + 2, + ); +} +impl cudnnMathType_t { + pub const CUDNN_FMA_MATH: cudnnMathType_t = cudnnMathType_t(3); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnMathType_t(pub ::core::ffi::c_uint); +impl cudnnNanPropagation_t { + pub const CUDNN_NOT_PROPAGATE_NAN: cudnnNanPropagation_t = cudnnNanPropagation_t(0); +} +impl cudnnNanPropagation_t { + pub const CUDNN_PROPAGATE_NAN: cudnnNanPropagation_t = cudnnNanPropagation_t(1); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnNanPropagation_t(pub ::core::ffi::c_uint); +impl cudnnTensorFormat_t { + pub const CUDNN_TENSOR_NCHW: cudnnTensorFormat_t = cudnnTensorFormat_t(0); +} +impl cudnnTensorFormat_t { + pub const CUDNN_TENSOR_NHWC: cudnnTensorFormat_t = cudnnTensorFormat_t(1); +} +impl cudnnTensorFormat_t { + pub const CUDNN_TENSOR_NCHW_VECT_C: cudnnTensorFormat_t = cudnnTensorFormat_t(2); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnTensorFormat_t(pub ::core::ffi::c_uint); +impl cudnnReduceTensorOp_t { + pub const CUDNN_REDUCE_TENSOR_ADD: cudnnReduceTensorOp_t = cudnnReduceTensorOp_t(0); +} +impl cudnnReduceTensorOp_t { + pub const CUDNN_REDUCE_TENSOR_MUL: cudnnReduceTensorOp_t = cudnnReduceTensorOp_t(1); +} +impl cudnnReduceTensorOp_t { + pub const CUDNN_REDUCE_TENSOR_MIN: cudnnReduceTensorOp_t = cudnnReduceTensorOp_t(2); +} +impl cudnnReduceTensorOp_t { + 
pub const CUDNN_REDUCE_TENSOR_MAX: cudnnReduceTensorOp_t = cudnnReduceTensorOp_t(3); +} +impl cudnnReduceTensorOp_t { + pub const CUDNN_REDUCE_TENSOR_AMAX: cudnnReduceTensorOp_t = cudnnReduceTensorOp_t(4); +} +impl cudnnReduceTensorOp_t { + pub const CUDNN_REDUCE_TENSOR_AVG: cudnnReduceTensorOp_t = cudnnReduceTensorOp_t(5); +} +impl cudnnReduceTensorOp_t { + pub const CUDNN_REDUCE_TENSOR_NORM1: cudnnReduceTensorOp_t = cudnnReduceTensorOp_t( + 6, + ); +} +impl cudnnReduceTensorOp_t { + pub const CUDNN_REDUCE_TENSOR_NORM2: cudnnReduceTensorOp_t = cudnnReduceTensorOp_t( + 7, + ); +} +impl cudnnReduceTensorOp_t { + pub const CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS: cudnnReduceTensorOp_t = cudnnReduceTensorOp_t( + 8, + ); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnReduceTensorOp_t(pub ::core::ffi::c_uint); +impl cudnnActivationMode_t { + pub const CUDNN_ACTIVATION_SIGMOID: cudnnActivationMode_t = cudnnActivationMode_t(0); +} +impl cudnnActivationMode_t { + pub const CUDNN_ACTIVATION_RELU: cudnnActivationMode_t = cudnnActivationMode_t(1); +} +impl cudnnActivationMode_t { + pub const CUDNN_ACTIVATION_TANH: cudnnActivationMode_t = cudnnActivationMode_t(2); +} +impl cudnnActivationMode_t { + pub const CUDNN_ACTIVATION_CLIPPED_RELU: cudnnActivationMode_t = cudnnActivationMode_t( + 3, + ); +} +impl cudnnActivationMode_t { + pub const CUDNN_ACTIVATION_ELU: cudnnActivationMode_t = cudnnActivationMode_t(4); +} +impl cudnnActivationMode_t { + pub const CUDNN_ACTIVATION_IDENTITY: cudnnActivationMode_t = cudnnActivationMode_t( + 5, + ); +} +impl cudnnActivationMode_t { + pub const CUDNN_ACTIVATION_SWISH: cudnnActivationMode_t = cudnnActivationMode_t(6); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnActivationMode_t(pub ::core::ffi::c_uint); +impl cudnnSeverity_t { + pub const CUDNN_SEV_FATAL: cudnnSeverity_t = cudnnSeverity_t(0); +} +impl cudnnSeverity_t { + pub const CUDNN_SEV_ERROR: cudnnSeverity_t = cudnnSeverity_t(1); +} +impl cudnnSeverity_t { + pub const CUDNN_SEV_WARNING: cudnnSeverity_t = cudnnSeverity_t(2); +} +impl cudnnSeverity_t { + pub const CUDNN_SEV_INFO: cudnnSeverity_t = cudnnSeverity_t(3); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnSeverity_t(pub ::core::ffi::c_uint); +impl cudnnConvolutionMode_t { + pub const CUDNN_CONVOLUTION: cudnnConvolutionMode_t = cudnnConvolutionMode_t(0); +} +impl cudnnConvolutionMode_t { + pub const CUDNN_CROSS_CORRELATION: cudnnConvolutionMode_t = cudnnConvolutionMode_t( + 1, + ); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnConvolutionMode_t(pub ::core::ffi::c_uint); +impl cudnnReorderType_t { + pub const CUDNN_DEFAULT_REORDER: cudnnReorderType_t = cudnnReorderType_t(0); +} +impl cudnnReorderType_t { + pub const CUDNN_NO_REORDER: cudnnReorderType_t = cudnnReorderType_t(1); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnReorderType_t(pub ::core::ffi::c_uint); +#[repr(C)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnFractionStruct { + pub numerator: i64, + pub denominator: i64, +} +impl cudnnResampleMode_t { + pub const CUDNN_RESAMPLE_NEAREST: cudnnResampleMode_t = cudnnResampleMode_t(0); +} +impl cudnnResampleMode_t { + pub const CUDNN_RESAMPLE_BILINEAR: cudnnResampleMode_t = cudnnResampleMode_t(1); +} +impl cudnnResampleMode_t { + pub const CUDNN_RESAMPLE_AVGPOOL: cudnnResampleMode_t = 
cudnnResampleMode_t(2); +} +impl cudnnResampleMode_t { + pub const CUDNN_RESAMPLE_AVGPOOL_INCLUDE_PADDING: cudnnResampleMode_t = cudnnResampleMode_t( + 2, + ); +} +impl cudnnResampleMode_t { + pub const CUDNN_RESAMPLE_AVGPOOL_EXCLUDE_PADDING: cudnnResampleMode_t = cudnnResampleMode_t( + 4, + ); +} +impl cudnnResampleMode_t { + pub const CUDNN_RESAMPLE_MAXPOOL: cudnnResampleMode_t = cudnnResampleMode_t(3); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnResampleMode_t(pub ::core::ffi::c_uint); +impl cudnnSignalMode_t { + pub const CUDNN_SIGNAL_SET: cudnnSignalMode_t = cudnnSignalMode_t(0); +} +impl cudnnSignalMode_t { + pub const CUDNN_SIGNAL_WAIT: cudnnSignalMode_t = cudnnSignalMode_t(1); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnSignalMode_t(pub ::core::ffi::c_uint); +impl cudnnGenStatsMode_t { + pub const CUDNN_GENSTATS_SUM_SQSUM: cudnnGenStatsMode_t = cudnnGenStatsMode_t(0); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnGenStatsMode_t(pub ::core::ffi::c_uint); +impl cudnnBnFinalizeStatsMode_t { + pub const CUDNN_BN_FINALIZE_STATISTICS_TRAINING: cudnnBnFinalizeStatsMode_t = cudnnBnFinalizeStatsMode_t( + 0, + ); +} +impl cudnnBnFinalizeStatsMode_t { + pub const CUDNN_BN_FINALIZE_STATISTICS_INFERENCE: cudnnBnFinalizeStatsMode_t = cudnnBnFinalizeStatsMode_t( + 1, + ); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnBnFinalizeStatsMode_t(pub ::core::ffi::c_uint); +impl cudnnRngDistribution_t { + pub const CUDNN_RNG_DISTRIBUTION_BERNOULLI: cudnnRngDistribution_t = cudnnRngDistribution_t( + 0, + ); +} +impl cudnnRngDistribution_t { + pub const CUDNN_RNG_DISTRIBUTION_UNIFORM: cudnnRngDistribution_t = cudnnRngDistribution_t( + 1, + ); +} +impl cudnnRngDistribution_t { + pub const CUDNN_RNG_DISTRIBUTION_NORMAL: cudnnRngDistribution_t = cudnnRngDistribution_t( + 2, + ); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnRngDistribution_t(pub ::core::ffi::c_uint); +impl cudnnBackendAttributeType_t { + pub const CUDNN_TYPE_HANDLE: cudnnBackendAttributeType_t = cudnnBackendAttributeType_t( + 0, + ); +} +impl cudnnBackendAttributeType_t { + pub const CUDNN_TYPE_DATA_TYPE: cudnnBackendAttributeType_t = cudnnBackendAttributeType_t( + 1, + ); +} +impl cudnnBackendAttributeType_t { + pub const CUDNN_TYPE_BOOLEAN: cudnnBackendAttributeType_t = cudnnBackendAttributeType_t( + 2, + ); +} +impl cudnnBackendAttributeType_t { + pub const CUDNN_TYPE_INT64: cudnnBackendAttributeType_t = cudnnBackendAttributeType_t( + 3, + ); +} +impl cudnnBackendAttributeType_t { + pub const CUDNN_TYPE_FLOAT: cudnnBackendAttributeType_t = cudnnBackendAttributeType_t( + 4, + ); +} +impl cudnnBackendAttributeType_t { + pub const CUDNN_TYPE_DOUBLE: cudnnBackendAttributeType_t = cudnnBackendAttributeType_t( + 5, + ); +} +impl cudnnBackendAttributeType_t { + pub const CUDNN_TYPE_VOID_PTR: cudnnBackendAttributeType_t = cudnnBackendAttributeType_t( + 6, + ); +} +impl cudnnBackendAttributeType_t { + pub const CUDNN_TYPE_CONVOLUTION_MODE: cudnnBackendAttributeType_t = cudnnBackendAttributeType_t( + 7, + ); +} +impl cudnnBackendAttributeType_t { + pub const CUDNN_TYPE_HEUR_MODE: cudnnBackendAttributeType_t = cudnnBackendAttributeType_t( + 8, + ); +} +impl cudnnBackendAttributeType_t { + pub const CUDNN_TYPE_KNOB_TYPE: cudnnBackendAttributeType_t = cudnnBackendAttributeType_t( + 9, + ); +} +impl 
cudnnBackendAttributeType_t { + pub const CUDNN_TYPE_NAN_PROPOGATION: cudnnBackendAttributeType_t = cudnnBackendAttributeType_t( + 10, + ); +} +impl cudnnBackendAttributeType_t { + pub const CUDNN_TYPE_NUMERICAL_NOTE: cudnnBackendAttributeType_t = cudnnBackendAttributeType_t( + 11, + ); +} +impl cudnnBackendAttributeType_t { + pub const CUDNN_TYPE_LAYOUT_TYPE: cudnnBackendAttributeType_t = cudnnBackendAttributeType_t( + 12, + ); +} +impl cudnnBackendAttributeType_t { + pub const CUDNN_TYPE_ATTRIB_NAME: cudnnBackendAttributeType_t = cudnnBackendAttributeType_t( + 13, + ); +} +impl cudnnBackendAttributeType_t { + pub const CUDNN_TYPE_POINTWISE_MODE: cudnnBackendAttributeType_t = cudnnBackendAttributeType_t( + 14, + ); +} +impl cudnnBackendAttributeType_t { + pub const CUDNN_TYPE_BACKEND_DESCRIPTOR: cudnnBackendAttributeType_t = cudnnBackendAttributeType_t( + 15, + ); +} +impl cudnnBackendAttributeType_t { + pub const CUDNN_TYPE_GENSTATS_MODE: cudnnBackendAttributeType_t = cudnnBackendAttributeType_t( + 16, + ); +} +impl cudnnBackendAttributeType_t { + pub const CUDNN_TYPE_BN_FINALIZE_STATS_MODE: cudnnBackendAttributeType_t = cudnnBackendAttributeType_t( + 17, + ); +} +impl cudnnBackendAttributeType_t { + pub const CUDNN_TYPE_REDUCTION_OPERATOR_TYPE: cudnnBackendAttributeType_t = cudnnBackendAttributeType_t( + 18, + ); +} +impl cudnnBackendAttributeType_t { + pub const CUDNN_TYPE_BEHAVIOR_NOTE: cudnnBackendAttributeType_t = cudnnBackendAttributeType_t( + 19, + ); +} +impl cudnnBackendAttributeType_t { + pub const CUDNN_TYPE_TENSOR_REORDERING_MODE: cudnnBackendAttributeType_t = cudnnBackendAttributeType_t( + 20, + ); +} +impl cudnnBackendAttributeType_t { + pub const CUDNN_TYPE_RESAMPLE_MODE: cudnnBackendAttributeType_t = cudnnBackendAttributeType_t( + 21, + ); +} +impl cudnnBackendAttributeType_t { + pub const CUDNN_TYPE_PADDING_MODE: cudnnBackendAttributeType_t = cudnnBackendAttributeType_t( + 22, + ); +} +impl cudnnBackendAttributeType_t { + pub const CUDNN_TYPE_INT32: cudnnBackendAttributeType_t = cudnnBackendAttributeType_t( + 23, + ); +} +impl cudnnBackendAttributeType_t { + pub const CUDNN_TYPE_CHAR: cudnnBackendAttributeType_t = cudnnBackendAttributeType_t( + 24, + ); +} +impl cudnnBackendAttributeType_t { + pub const CUDNN_TYPE_SIGNAL_MODE: cudnnBackendAttributeType_t = cudnnBackendAttributeType_t( + 25, + ); +} +impl cudnnBackendAttributeType_t { + pub const CUDNN_TYPE_FRACTION: cudnnBackendAttributeType_t = cudnnBackendAttributeType_t( + 26, + ); +} +impl cudnnBackendAttributeType_t { + pub const CUDNN_TYPE_NORM_MODE: cudnnBackendAttributeType_t = cudnnBackendAttributeType_t( + 27, + ); +} +impl cudnnBackendAttributeType_t { + pub const CUDNN_TYPE_NORM_FWD_PHASE: cudnnBackendAttributeType_t = cudnnBackendAttributeType_t( + 28, + ); +} +impl cudnnBackendAttributeType_t { + pub const CUDNN_TYPE_RNG_DISTRIBUTION: cudnnBackendAttributeType_t = cudnnBackendAttributeType_t( + 29, + ); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnBackendAttributeType_t(pub ::core::ffi::c_uint); +impl cudnnBackendLayoutType_t { + pub const CUDNN_LAYOUT_TYPE_PREFERRED_NCHW: cudnnBackendLayoutType_t = cudnnBackendLayoutType_t( + 0, + ); +} +impl cudnnBackendLayoutType_t { + pub const CUDNN_LAYOUT_TYPE_PREFERRED_NHWC: cudnnBackendLayoutType_t = cudnnBackendLayoutType_t( + 1, + ); +} +impl cudnnBackendLayoutType_t { + pub const CUDNN_LAYOUT_TYPE_PREFERRED_PAD4CK: cudnnBackendLayoutType_t = cudnnBackendLayoutType_t( + 2, + ); +} +impl cudnnBackendLayoutType_t { + 
pub const CUDNN_LAYOUT_TYPE_PREFERRED_PAD8CK: cudnnBackendLayoutType_t = cudnnBackendLayoutType_t( + 3, + ); +} +impl cudnnBackendLayoutType_t { + pub const CUDNN_LAYOUT_TYPE_COUNT: cudnnBackendLayoutType_t = cudnnBackendLayoutType_t( + 4, + ); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnBackendLayoutType_t(pub ::core::ffi::c_uint); +impl cudnnBackendHeurMode_t { + pub const CUDNN_HEUR_MODE_INSTANT: cudnnBackendHeurMode_t = cudnnBackendHeurMode_t( + 0, + ); +} +impl cudnnBackendHeurMode_t { + pub const CUDNN_HEUR_MODE_B: cudnnBackendHeurMode_t = cudnnBackendHeurMode_t(1); +} +impl cudnnBackendHeurMode_t { + pub const CUDNN_HEUR_MODE_FALLBACK: cudnnBackendHeurMode_t = cudnnBackendHeurMode_t( + 2, + ); +} +impl cudnnBackendHeurMode_t { + pub const CUDNN_HEUR_MODE_A: cudnnBackendHeurMode_t = cudnnBackendHeurMode_t(3); +} +impl cudnnBackendHeurMode_t { + pub const CUDNN_HEUR_MODES_COUNT: cudnnBackendHeurMode_t = cudnnBackendHeurMode_t(4); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnBackendHeurMode_t(pub ::core::ffi::c_uint); +impl cudnnPaddingMode_t { + pub const CUDNN_ZERO_PAD: cudnnPaddingMode_t = cudnnPaddingMode_t(0); +} +impl cudnnPaddingMode_t { + pub const CUDNN_NEG_INF_PAD: cudnnPaddingMode_t = cudnnPaddingMode_t(1); +} +impl cudnnPaddingMode_t { + pub const CUDNN_EDGE_VAL_PAD: cudnnPaddingMode_t = cudnnPaddingMode_t(2); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnPaddingMode_t(pub ::core::ffi::c_uint); +impl cudnnBackendNormFwdPhase_t { + pub const CUDNN_NORM_FWD_INFERENCE: cudnnBackendNormFwdPhase_t = cudnnBackendNormFwdPhase_t( + 0, + ); +} +impl cudnnBackendNormFwdPhase_t { + pub const CUDNN_NORM_FWD_TRAINING: cudnnBackendNormFwdPhase_t = cudnnBackendNormFwdPhase_t( + 1, + ); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnBackendNormFwdPhase_t(pub ::core::ffi::c_uint); +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct cudnnTensorStruct { + _unused: [u8; 0], +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct cudnnPoolingStruct { + _unused: [u8; 0], +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct cudnnFilterStruct { + _unused: [u8; 0], +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct cudnnLRNStruct { + _unused: [u8; 0], +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct cudnnActivationStruct { + _unused: [u8; 0], +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct cudnnSpatialTransformerStruct { + _unused: [u8; 0], +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct cudnnOpTensorStruct { + _unused: [u8; 0], +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct cudnnReduceTensorStruct { + _unused: [u8; 0], +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct cudnnCTCLossStruct { + _unused: [u8; 0], +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct cudnnTensorTransformStruct { + _unused: [u8; 0], +} +impl cudnnDeterminism_t { + pub const CUDNN_NON_DETERMINISTIC: cudnnDeterminism_t = cudnnDeterminism_t(0); +} +impl cudnnDeterminism_t { + pub const CUDNN_DETERMINISTIC: cudnnDeterminism_t = cudnnDeterminism_t(1); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnDeterminism_t(pub ::core::ffi::c_uint); +impl cudnnFoldingDirection_t { + pub const CUDNN_TRANSFORM_FOLD: cudnnFoldingDirection_t = cudnnFoldingDirection_t(0); +} +impl cudnnFoldingDirection_t { + pub const 
CUDNN_TRANSFORM_UNFOLD: cudnnFoldingDirection_t = cudnnFoldingDirection_t( + 1, + ); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnFoldingDirection_t(pub ::core::ffi::c_uint); +impl cudnnOpTensorOp_t { + pub const CUDNN_OP_TENSOR_ADD: cudnnOpTensorOp_t = cudnnOpTensorOp_t(0); +} +impl cudnnOpTensorOp_t { + pub const CUDNN_OP_TENSOR_MUL: cudnnOpTensorOp_t = cudnnOpTensorOp_t(1); +} +impl cudnnOpTensorOp_t { + pub const CUDNN_OP_TENSOR_MIN: cudnnOpTensorOp_t = cudnnOpTensorOp_t(2); +} +impl cudnnOpTensorOp_t { + pub const CUDNN_OP_TENSOR_MAX: cudnnOpTensorOp_t = cudnnOpTensorOp_t(3); +} +impl cudnnOpTensorOp_t { + pub const CUDNN_OP_TENSOR_SQRT: cudnnOpTensorOp_t = cudnnOpTensorOp_t(4); +} +impl cudnnOpTensorOp_t { + pub const CUDNN_OP_TENSOR_NOT: cudnnOpTensorOp_t = cudnnOpTensorOp_t(5); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnOpTensorOp_t(pub ::core::ffi::c_uint); +impl cudnnReduceTensorIndices_t { + pub const CUDNN_REDUCE_TENSOR_NO_INDICES: cudnnReduceTensorIndices_t = cudnnReduceTensorIndices_t( + 0, + ); +} +impl cudnnReduceTensorIndices_t { + pub const CUDNN_REDUCE_TENSOR_FLATTENED_INDICES: cudnnReduceTensorIndices_t = cudnnReduceTensorIndices_t( + 1, + ); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnReduceTensorIndices_t(pub ::core::ffi::c_uint); +impl cudnnIndicesType_t { + pub const CUDNN_32BIT_INDICES: cudnnIndicesType_t = cudnnIndicesType_t(0); +} +impl cudnnIndicesType_t { + pub const CUDNN_64BIT_INDICES: cudnnIndicesType_t = cudnnIndicesType_t(1); +} +impl cudnnIndicesType_t { + pub const CUDNN_16BIT_INDICES: cudnnIndicesType_t = cudnnIndicesType_t(2); +} +impl cudnnIndicesType_t { + pub const CUDNN_8BIT_INDICES: cudnnIndicesType_t = cudnnIndicesType_t(3); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnIndicesType_t(pub ::core::ffi::c_uint); +impl cudnnSoftmaxAlgorithm_t { + pub const CUDNN_SOFTMAX_FAST: cudnnSoftmaxAlgorithm_t = cudnnSoftmaxAlgorithm_t(0); +} +impl cudnnSoftmaxAlgorithm_t { + pub const CUDNN_SOFTMAX_ACCURATE: cudnnSoftmaxAlgorithm_t = cudnnSoftmaxAlgorithm_t( + 1, + ); +} +impl cudnnSoftmaxAlgorithm_t { + pub const CUDNN_SOFTMAX_LOG: cudnnSoftmaxAlgorithm_t = cudnnSoftmaxAlgorithm_t(2); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnSoftmaxAlgorithm_t(pub ::core::ffi::c_uint); +impl cudnnSoftmaxMode_t { + pub const CUDNN_SOFTMAX_MODE_INSTANCE: cudnnSoftmaxMode_t = cudnnSoftmaxMode_t(0); +} +impl cudnnSoftmaxMode_t { + pub const CUDNN_SOFTMAX_MODE_CHANNEL: cudnnSoftmaxMode_t = cudnnSoftmaxMode_t(1); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnSoftmaxMode_t(pub ::core::ffi::c_uint); +impl cudnnPoolingMode_t { + pub const CUDNN_POOLING_MAX: cudnnPoolingMode_t = cudnnPoolingMode_t(0); +} +impl cudnnPoolingMode_t { + pub const CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING: cudnnPoolingMode_t = cudnnPoolingMode_t( + 1, + ); +} +impl cudnnPoolingMode_t { + pub const CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING: cudnnPoolingMode_t = cudnnPoolingMode_t( + 2, + ); +} +impl cudnnPoolingMode_t { + pub const CUDNN_POOLING_MAX_DETERMINISTIC: cudnnPoolingMode_t = cudnnPoolingMode_t( + 3, + ); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnPoolingMode_t(pub ::core::ffi::c_uint); +impl cudnnLRNMode_t { + pub const CUDNN_LRN_CROSS_CHANNEL_DIM1: 
cudnnLRNMode_t = cudnnLRNMode_t(0); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnLRNMode_t(pub ::core::ffi::c_uint); +impl cudnnDivNormMode_t { + pub const CUDNN_DIVNORM_PRECOMPUTED_MEANS: cudnnDivNormMode_t = cudnnDivNormMode_t( + 0, + ); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnDivNormMode_t(pub ::core::ffi::c_uint); +impl cudnnBatchNormMode_t { + pub const CUDNN_BATCHNORM_PER_ACTIVATION: cudnnBatchNormMode_t = cudnnBatchNormMode_t( + 0, + ); +} +impl cudnnBatchNormMode_t { + pub const CUDNN_BATCHNORM_SPATIAL: cudnnBatchNormMode_t = cudnnBatchNormMode_t(1); +} +impl cudnnBatchNormMode_t { + pub const CUDNN_BATCHNORM_SPATIAL_PERSISTENT: cudnnBatchNormMode_t = cudnnBatchNormMode_t( + 2, + ); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnBatchNormMode_t(pub ::core::ffi::c_uint); +impl cudnnBatchNormOps_t { + pub const CUDNN_BATCHNORM_OPS_BN: cudnnBatchNormOps_t = cudnnBatchNormOps_t(0); +} +impl cudnnBatchNormOps_t { + pub const CUDNN_BATCHNORM_OPS_BN_ACTIVATION: cudnnBatchNormOps_t = cudnnBatchNormOps_t( + 1, + ); +} +impl cudnnBatchNormOps_t { + pub const CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION: cudnnBatchNormOps_t = cudnnBatchNormOps_t( + 2, + ); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnBatchNormOps_t(pub ::core::ffi::c_uint); +impl cudnnNormMode_t { + pub const CUDNN_NORM_PER_ACTIVATION: cudnnNormMode_t = cudnnNormMode_t(0); +} +impl cudnnNormMode_t { + pub const CUDNN_NORM_PER_CHANNEL: cudnnNormMode_t = cudnnNormMode_t(1); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnNormMode_t(pub ::core::ffi::c_uint); +impl cudnnNormAlgo_t { + pub const CUDNN_NORM_ALGO_STANDARD: cudnnNormAlgo_t = cudnnNormAlgo_t(0); +} +impl cudnnNormAlgo_t { + pub const CUDNN_NORM_ALGO_PERSIST: cudnnNormAlgo_t = cudnnNormAlgo_t(1); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnNormAlgo_t(pub ::core::ffi::c_uint); +impl cudnnNormOps_t { + pub const CUDNN_NORM_OPS_NORM: cudnnNormOps_t = cudnnNormOps_t(0); +} +impl cudnnNormOps_t { + pub const CUDNN_NORM_OPS_NORM_ACTIVATION: cudnnNormOps_t = cudnnNormOps_t(1); +} +impl cudnnNormOps_t { + pub const CUDNN_NORM_OPS_NORM_ADD_ACTIVATION: cudnnNormOps_t = cudnnNormOps_t(2); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnNormOps_t(pub ::core::ffi::c_uint); +impl cudnnSamplerType_t { + pub const CUDNN_SAMPLER_BILINEAR: cudnnSamplerType_t = cudnnSamplerType_t(0); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnSamplerType_t(pub ::core::ffi::c_uint); +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct cudnnDropoutStruct { + _unused: [u8; 0], +} +impl cudnnConvolutionFwdAlgo_t { + pub const CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM: cudnnConvolutionFwdAlgo_t = cudnnConvolutionFwdAlgo_t( + 0, + ); +} +impl cudnnConvolutionFwdAlgo_t { + pub const CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM: cudnnConvolutionFwdAlgo_t = cudnnConvolutionFwdAlgo_t( + 1, + ); +} +impl cudnnConvolutionFwdAlgo_t { + pub const CUDNN_CONVOLUTION_FWD_ALGO_GEMM: cudnnConvolutionFwdAlgo_t = cudnnConvolutionFwdAlgo_t( + 2, + ); +} +impl cudnnConvolutionFwdAlgo_t { + pub const CUDNN_CONVOLUTION_FWD_ALGO_DIRECT: cudnnConvolutionFwdAlgo_t = cudnnConvolutionFwdAlgo_t( + 3, + ); +} +impl cudnnConvolutionFwdAlgo_t { + pub 
const CUDNN_CONVOLUTION_FWD_ALGO_FFT: cudnnConvolutionFwdAlgo_t = cudnnConvolutionFwdAlgo_t( + 4, + ); +} +impl cudnnConvolutionFwdAlgo_t { + pub const CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING: cudnnConvolutionFwdAlgo_t = cudnnConvolutionFwdAlgo_t( + 5, + ); +} +impl cudnnConvolutionFwdAlgo_t { + pub const CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD: cudnnConvolutionFwdAlgo_t = cudnnConvolutionFwdAlgo_t( + 6, + ); +} +impl cudnnConvolutionFwdAlgo_t { + pub const CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED: cudnnConvolutionFwdAlgo_t = cudnnConvolutionFwdAlgo_t( + 7, + ); +} +impl cudnnConvolutionFwdAlgo_t { + pub const CUDNN_CONVOLUTION_FWD_ALGO_COUNT: cudnnConvolutionFwdAlgo_t = cudnnConvolutionFwdAlgo_t( + 8, + ); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnConvolutionFwdAlgo_t(pub ::core::ffi::c_uint); +impl cudnnConvolutionBwdFilterAlgo_t { + pub const CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0: cudnnConvolutionBwdFilterAlgo_t = cudnnConvolutionBwdFilterAlgo_t( + 0, + ); +} +impl cudnnConvolutionBwdFilterAlgo_t { + pub const CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1: cudnnConvolutionBwdFilterAlgo_t = cudnnConvolutionBwdFilterAlgo_t( + 1, + ); +} +impl cudnnConvolutionBwdFilterAlgo_t { + pub const CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT: cudnnConvolutionBwdFilterAlgo_t = cudnnConvolutionBwdFilterAlgo_t( + 2, + ); +} +impl cudnnConvolutionBwdFilterAlgo_t { + pub const CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3: cudnnConvolutionBwdFilterAlgo_t = cudnnConvolutionBwdFilterAlgo_t( + 3, + ); +} +impl cudnnConvolutionBwdFilterAlgo_t { + pub const CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD: cudnnConvolutionBwdFilterAlgo_t = cudnnConvolutionBwdFilterAlgo_t( + 4, + ); +} +impl cudnnConvolutionBwdFilterAlgo_t { + pub const CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED: cudnnConvolutionBwdFilterAlgo_t = cudnnConvolutionBwdFilterAlgo_t( + 5, + ); +} +impl cudnnConvolutionBwdFilterAlgo_t { + pub const CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING: cudnnConvolutionBwdFilterAlgo_t = cudnnConvolutionBwdFilterAlgo_t( + 6, + ); +} +impl cudnnConvolutionBwdFilterAlgo_t { + pub const CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT: cudnnConvolutionBwdFilterAlgo_t = cudnnConvolutionBwdFilterAlgo_t( + 7, + ); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnConvolutionBwdFilterAlgo_t(pub ::core::ffi::c_uint); +impl cudnnConvolutionBwdDataAlgo_t { + pub const CUDNN_CONVOLUTION_BWD_DATA_ALGO_0: cudnnConvolutionBwdDataAlgo_t = cudnnConvolutionBwdDataAlgo_t( + 0, + ); +} +impl cudnnConvolutionBwdDataAlgo_t { + pub const CUDNN_CONVOLUTION_BWD_DATA_ALGO_1: cudnnConvolutionBwdDataAlgo_t = cudnnConvolutionBwdDataAlgo_t( + 1, + ); +} +impl cudnnConvolutionBwdDataAlgo_t { + pub const CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT: cudnnConvolutionBwdDataAlgo_t = cudnnConvolutionBwdDataAlgo_t( + 2, + ); +} +impl cudnnConvolutionBwdDataAlgo_t { + pub const CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING: cudnnConvolutionBwdDataAlgo_t = cudnnConvolutionBwdDataAlgo_t( + 3, + ); +} +impl cudnnConvolutionBwdDataAlgo_t { + pub const CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD: cudnnConvolutionBwdDataAlgo_t = cudnnConvolutionBwdDataAlgo_t( + 4, + ); +} +impl cudnnConvolutionBwdDataAlgo_t { + pub const CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED: cudnnConvolutionBwdDataAlgo_t = cudnnConvolutionBwdDataAlgo_t( + 5, + ); +} +impl cudnnConvolutionBwdDataAlgo_t { + pub const CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT: cudnnConvolutionBwdDataAlgo_t = cudnnConvolutionBwdDataAlgo_t( + 6, + ); +} 
+#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnConvolutionBwdDataAlgo_t(pub ::core::ffi::c_uint); +impl cudnnCTCLossAlgo_t { + pub const CUDNN_CTC_LOSS_ALGO_DETERMINISTIC: cudnnCTCLossAlgo_t = cudnnCTCLossAlgo_t( + 0, + ); +} +impl cudnnCTCLossAlgo_t { + pub const CUDNN_CTC_LOSS_ALGO_NON_DETERMINISTIC: cudnnCTCLossAlgo_t = cudnnCTCLossAlgo_t( + 1, + ); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnCTCLossAlgo_t(pub ::core::ffi::c_uint); +impl cudnnRNNAlgo_t { + pub const CUDNN_RNN_ALGO_STANDARD: cudnnRNNAlgo_t = cudnnRNNAlgo_t(0); +} +impl cudnnRNNAlgo_t { + pub const CUDNN_RNN_ALGO_PERSIST_STATIC: cudnnRNNAlgo_t = cudnnRNNAlgo_t(1); +} +impl cudnnRNNAlgo_t { + pub const CUDNN_RNN_ALGO_PERSIST_DYNAMIC: cudnnRNNAlgo_t = cudnnRNNAlgo_t(2); +} +impl cudnnRNNAlgo_t { + pub const CUDNN_RNN_ALGO_PERSIST_STATIC_SMALL_H: cudnnRNNAlgo_t = cudnnRNNAlgo_t(3); +} +impl cudnnRNNAlgo_t { + pub const CUDNN_RNN_ALGO_COUNT: cudnnRNNAlgo_t = cudnnRNNAlgo_t(4); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnRNNAlgo_t(pub ::core::ffi::c_uint); +impl cudnnForwardMode_t { + pub const CUDNN_FWD_MODE_INFERENCE: cudnnForwardMode_t = cudnnForwardMode_t(0); +} +impl cudnnForwardMode_t { + pub const CUDNN_FWD_MODE_TRAINING: cudnnForwardMode_t = cudnnForwardMode_t(1); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnForwardMode_t(pub ::core::ffi::c_uint); +impl cudnnRNNMode_t { + pub const CUDNN_RNN_RELU: cudnnRNNMode_t = cudnnRNNMode_t(0); +} +impl cudnnRNNMode_t { + pub const CUDNN_RNN_TANH: cudnnRNNMode_t = cudnnRNNMode_t(1); +} +impl cudnnRNNMode_t { + pub const CUDNN_LSTM: cudnnRNNMode_t = cudnnRNNMode_t(2); +} +impl cudnnRNNMode_t { + pub const CUDNN_GRU: cudnnRNNMode_t = cudnnRNNMode_t(3); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnRNNMode_t(pub ::core::ffi::c_uint); +impl cudnnRNNBiasMode_t { + pub const CUDNN_RNN_NO_BIAS: cudnnRNNBiasMode_t = cudnnRNNBiasMode_t(0); +} +impl cudnnRNNBiasMode_t { + pub const CUDNN_RNN_SINGLE_INP_BIAS: cudnnRNNBiasMode_t = cudnnRNNBiasMode_t(1); +} +impl cudnnRNNBiasMode_t { + pub const CUDNN_RNN_DOUBLE_BIAS: cudnnRNNBiasMode_t = cudnnRNNBiasMode_t(2); +} +impl cudnnRNNBiasMode_t { + pub const CUDNN_RNN_SINGLE_REC_BIAS: cudnnRNNBiasMode_t = cudnnRNNBiasMode_t(3); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnRNNBiasMode_t(pub ::core::ffi::c_uint); +impl cudnnDirectionMode_t { + pub const CUDNN_UNIDIRECTIONAL: cudnnDirectionMode_t = cudnnDirectionMode_t(0); +} +impl cudnnDirectionMode_t { + pub const CUDNN_BIDIRECTIONAL: cudnnDirectionMode_t = cudnnDirectionMode_t(1); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnDirectionMode_t(pub ::core::ffi::c_uint); +impl cudnnRNNInputMode_t { + pub const CUDNN_LINEAR_INPUT: cudnnRNNInputMode_t = cudnnRNNInputMode_t(0); +} +impl cudnnRNNInputMode_t { + pub const CUDNN_SKIP_INPUT: cudnnRNNInputMode_t = cudnnRNNInputMode_t(1); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnRNNInputMode_t(pub ::core::ffi::c_uint); +impl cudnnRNNClipMode_t { + pub const CUDNN_RNN_CLIP_NONE: cudnnRNNClipMode_t = cudnnRNNClipMode_t(0); +} +impl cudnnRNNClipMode_t { + pub const CUDNN_RNN_CLIP_MINMAX: cudnnRNNClipMode_t = cudnnRNNClipMode_t(1); +} +#[repr(transparent)] +#[derive(Debug, Copy, 
Clone, Hash, PartialEq, Eq)] +pub struct cudnnRNNClipMode_t(pub ::core::ffi::c_uint); +impl cudnnRNNDataLayout_t { + pub const CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED: cudnnRNNDataLayout_t = cudnnRNNDataLayout_t( + 0, + ); +} +impl cudnnRNNDataLayout_t { + pub const CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED: cudnnRNNDataLayout_t = cudnnRNNDataLayout_t( + 1, + ); +} +impl cudnnRNNDataLayout_t { + pub const CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED: cudnnRNNDataLayout_t = cudnnRNNDataLayout_t( + 2, + ); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnRNNDataLayout_t(pub ::core::ffi::c_uint); +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct cudnnRNNStruct { + _unused: [u8; 0], +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct cudnnRNNDataStruct { + _unused: [u8; 0], +} +impl cudnnSeqDataAxis_t { + pub const CUDNN_SEQDATA_TIME_DIM: cudnnSeqDataAxis_t = cudnnSeqDataAxis_t(0); +} +impl cudnnSeqDataAxis_t { + pub const CUDNN_SEQDATA_BATCH_DIM: cudnnSeqDataAxis_t = cudnnSeqDataAxis_t(1); +} +impl cudnnSeqDataAxis_t { + pub const CUDNN_SEQDATA_BEAM_DIM: cudnnSeqDataAxis_t = cudnnSeqDataAxis_t(2); +} +impl cudnnSeqDataAxis_t { + pub const CUDNN_SEQDATA_VECT_DIM: cudnnSeqDataAxis_t = cudnnSeqDataAxis_t(3); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnSeqDataAxis_t(pub ::core::ffi::c_uint); +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct cudnnSeqDataStruct { + _unused: [u8; 0], +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct cudnnAttnStruct { + _unused: [u8; 0], +} +impl cudnnMultiHeadAttnWeightKind_t { + pub const CUDNN_MH_ATTN_Q_WEIGHTS: cudnnMultiHeadAttnWeightKind_t = cudnnMultiHeadAttnWeightKind_t( + 0, + ); +} +impl cudnnMultiHeadAttnWeightKind_t { + pub const CUDNN_MH_ATTN_K_WEIGHTS: cudnnMultiHeadAttnWeightKind_t = cudnnMultiHeadAttnWeightKind_t( + 1, + ); +} +impl cudnnMultiHeadAttnWeightKind_t { + pub const CUDNN_MH_ATTN_V_WEIGHTS: cudnnMultiHeadAttnWeightKind_t = cudnnMultiHeadAttnWeightKind_t( + 2, + ); +} +impl cudnnMultiHeadAttnWeightKind_t { + pub const CUDNN_MH_ATTN_O_WEIGHTS: cudnnMultiHeadAttnWeightKind_t = cudnnMultiHeadAttnWeightKind_t( + 3, + ); +} +impl cudnnMultiHeadAttnWeightKind_t { + pub const CUDNN_MH_ATTN_Q_BIASES: cudnnMultiHeadAttnWeightKind_t = cudnnMultiHeadAttnWeightKind_t( + 4, + ); +} +impl cudnnMultiHeadAttnWeightKind_t { + pub const CUDNN_MH_ATTN_K_BIASES: cudnnMultiHeadAttnWeightKind_t = cudnnMultiHeadAttnWeightKind_t( + 5, + ); +} +impl cudnnMultiHeadAttnWeightKind_t { + pub const CUDNN_MH_ATTN_V_BIASES: cudnnMultiHeadAttnWeightKind_t = cudnnMultiHeadAttnWeightKind_t( + 6, + ); +} +impl cudnnMultiHeadAttnWeightKind_t { + pub const CUDNN_MH_ATTN_O_BIASES: cudnnMultiHeadAttnWeightKind_t = cudnnMultiHeadAttnWeightKind_t( + 7, + ); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnMultiHeadAttnWeightKind_t(pub ::core::ffi::c_uint); +impl cudnnWgradMode_t { + pub const CUDNN_WGRAD_MODE_ADD: cudnnWgradMode_t = cudnnWgradMode_t(0); +} +impl cudnnWgradMode_t { + pub const CUDNN_WGRAD_MODE_SET: cudnnWgradMode_t = cudnnWgradMode_t(1); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnWgradMode_t(pub ::core::ffi::c_uint); +impl cudnnLossNormalizationMode_t { + pub const CUDNN_LOSS_NORMALIZATION_NONE: cudnnLossNormalizationMode_t = cudnnLossNormalizationMode_t( + 0, + ); +} +impl cudnnLossNormalizationMode_t { + pub const CUDNN_LOSS_NORMALIZATION_SOFTMAX: 
cudnnLossNormalizationMode_t = cudnnLossNormalizationMode_t( + 1, + ); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnLossNormalizationMode_t(pub ::core::ffi::c_uint); +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct cudnnConvolutionStruct { + _unused: [u8; 0], +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct cudnnFusedOpsConstParamStruct { + _unused: [u8; 0], +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct cudnnFusedOpsVariantParamStruct { + _unused: [u8; 0], +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct cudnnFusedOpsPlanStruct { + _unused: [u8; 0], +} +impl cudnnFusedOps_t { + pub const CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS: cudnnFusedOps_t = cudnnFusedOps_t( + 0, + ); +} +impl cudnnFusedOps_t { + pub const CUDNN_FUSED_SCALE_BIAS_ACTIVATION_WGRAD: cudnnFusedOps_t = cudnnFusedOps_t( + 1, + ); +} +impl cudnnFusedOps_t { + pub const CUDNN_FUSED_BN_FINALIZE_STATISTICS_TRAINING: cudnnFusedOps_t = cudnnFusedOps_t( + 2, + ); +} +impl cudnnFusedOps_t { + pub const CUDNN_FUSED_BN_FINALIZE_STATISTICS_INFERENCE: cudnnFusedOps_t = cudnnFusedOps_t( + 3, + ); +} +impl cudnnFusedOps_t { + pub const CUDNN_FUSED_CONV_SCALE_BIAS_ADD_ACTIVATION: cudnnFusedOps_t = cudnnFusedOps_t( + 4, + ); +} +impl cudnnFusedOps_t { + pub const CUDNN_FUSED_SCALE_BIAS_ADD_ACTIVATION_GEN_BITMASK: cudnnFusedOps_t = cudnnFusedOps_t( + 5, + ); +} +impl cudnnFusedOps_t { + pub const CUDNN_FUSED_DACTIVATION_FORK_DBATCHNORM: cudnnFusedOps_t = cudnnFusedOps_t( + 6, + ); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnFusedOps_t(pub ::core::ffi::c_uint); +impl cudnnFusedOpsConstParamLabel_t { + pub const CUDNN_PARAM_XDESC: cudnnFusedOpsConstParamLabel_t = cudnnFusedOpsConstParamLabel_t( + 0, + ); +} +impl cudnnFusedOpsConstParamLabel_t { + pub const CUDNN_PARAM_XDATA_PLACEHOLDER: cudnnFusedOpsConstParamLabel_t = cudnnFusedOpsConstParamLabel_t( + 1, + ); +} +impl cudnnFusedOpsConstParamLabel_t { + pub const CUDNN_PARAM_BN_MODE: cudnnFusedOpsConstParamLabel_t = cudnnFusedOpsConstParamLabel_t( + 2, + ); +} +impl cudnnFusedOpsConstParamLabel_t { + pub const CUDNN_PARAM_BN_EQSCALEBIAS_DESC: cudnnFusedOpsConstParamLabel_t = cudnnFusedOpsConstParamLabel_t( + 3, + ); +} +impl cudnnFusedOpsConstParamLabel_t { + pub const CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER: cudnnFusedOpsConstParamLabel_t = cudnnFusedOpsConstParamLabel_t( + 4, + ); +} +impl cudnnFusedOpsConstParamLabel_t { + pub const CUDNN_PARAM_BN_EQBIAS_PLACEHOLDER: cudnnFusedOpsConstParamLabel_t = cudnnFusedOpsConstParamLabel_t( + 5, + ); +} +impl cudnnFusedOpsConstParamLabel_t { + pub const CUDNN_PARAM_ACTIVATION_DESC: cudnnFusedOpsConstParamLabel_t = cudnnFusedOpsConstParamLabel_t( + 6, + ); +} +impl cudnnFusedOpsConstParamLabel_t { + pub const CUDNN_PARAM_CONV_DESC: cudnnFusedOpsConstParamLabel_t = cudnnFusedOpsConstParamLabel_t( + 7, + ); +} +impl cudnnFusedOpsConstParamLabel_t { + pub const CUDNN_PARAM_WDESC: cudnnFusedOpsConstParamLabel_t = cudnnFusedOpsConstParamLabel_t( + 8, + ); +} +impl cudnnFusedOpsConstParamLabel_t { + pub const CUDNN_PARAM_WDATA_PLACEHOLDER: cudnnFusedOpsConstParamLabel_t = cudnnFusedOpsConstParamLabel_t( + 9, + ); +} +impl cudnnFusedOpsConstParamLabel_t { + pub const CUDNN_PARAM_DWDESC: cudnnFusedOpsConstParamLabel_t = cudnnFusedOpsConstParamLabel_t( + 10, + ); +} +impl cudnnFusedOpsConstParamLabel_t { + pub const CUDNN_PARAM_DWDATA_PLACEHOLDER: cudnnFusedOpsConstParamLabel_t = cudnnFusedOpsConstParamLabel_t( + 11, + ); +} 
+impl cudnnFusedOpsConstParamLabel_t { + pub const CUDNN_PARAM_YDESC: cudnnFusedOpsConstParamLabel_t = cudnnFusedOpsConstParamLabel_t( + 12, + ); +} +impl cudnnFusedOpsConstParamLabel_t { + pub const CUDNN_PARAM_YDATA_PLACEHOLDER: cudnnFusedOpsConstParamLabel_t = cudnnFusedOpsConstParamLabel_t( + 13, + ); +} +impl cudnnFusedOpsConstParamLabel_t { + pub const CUDNN_PARAM_DYDESC: cudnnFusedOpsConstParamLabel_t = cudnnFusedOpsConstParamLabel_t( + 14, + ); +} +impl cudnnFusedOpsConstParamLabel_t { + pub const CUDNN_PARAM_DYDATA_PLACEHOLDER: cudnnFusedOpsConstParamLabel_t = cudnnFusedOpsConstParamLabel_t( + 15, + ); +} +impl cudnnFusedOpsConstParamLabel_t { + pub const CUDNN_PARAM_YSTATS_DESC: cudnnFusedOpsConstParamLabel_t = cudnnFusedOpsConstParamLabel_t( + 16, + ); +} +impl cudnnFusedOpsConstParamLabel_t { + pub const CUDNN_PARAM_YSUM_PLACEHOLDER: cudnnFusedOpsConstParamLabel_t = cudnnFusedOpsConstParamLabel_t( + 17, + ); +} +impl cudnnFusedOpsConstParamLabel_t { + pub const CUDNN_PARAM_YSQSUM_PLACEHOLDER: cudnnFusedOpsConstParamLabel_t = cudnnFusedOpsConstParamLabel_t( + 18, + ); +} +impl cudnnFusedOpsConstParamLabel_t { + pub const CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC: cudnnFusedOpsConstParamLabel_t = cudnnFusedOpsConstParamLabel_t( + 19, + ); +} +impl cudnnFusedOpsConstParamLabel_t { + pub const CUDNN_PARAM_BN_SCALE_PLACEHOLDER: cudnnFusedOpsConstParamLabel_t = cudnnFusedOpsConstParamLabel_t( + 20, + ); +} +impl cudnnFusedOpsConstParamLabel_t { + pub const CUDNN_PARAM_BN_BIAS_PLACEHOLDER: cudnnFusedOpsConstParamLabel_t = cudnnFusedOpsConstParamLabel_t( + 21, + ); +} +impl cudnnFusedOpsConstParamLabel_t { + pub const CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER: cudnnFusedOpsConstParamLabel_t = cudnnFusedOpsConstParamLabel_t( + 22, + ); +} +impl cudnnFusedOpsConstParamLabel_t { + pub const CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER: cudnnFusedOpsConstParamLabel_t = cudnnFusedOpsConstParamLabel_t( + 23, + ); +} +impl cudnnFusedOpsConstParamLabel_t { + pub const CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER: cudnnFusedOpsConstParamLabel_t = cudnnFusedOpsConstParamLabel_t( + 24, + ); +} +impl cudnnFusedOpsConstParamLabel_t { + pub const CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER: cudnnFusedOpsConstParamLabel_t = cudnnFusedOpsConstParamLabel_t( + 25, + ); +} +impl cudnnFusedOpsConstParamLabel_t { + pub const CUDNN_PARAM_ZDESC: cudnnFusedOpsConstParamLabel_t = cudnnFusedOpsConstParamLabel_t( + 26, + ); +} +impl cudnnFusedOpsConstParamLabel_t { + pub const CUDNN_PARAM_ZDATA_PLACEHOLDER: cudnnFusedOpsConstParamLabel_t = cudnnFusedOpsConstParamLabel_t( + 27, + ); +} +impl cudnnFusedOpsConstParamLabel_t { + pub const CUDNN_PARAM_BN_Z_EQSCALEBIAS_DESC: cudnnFusedOpsConstParamLabel_t = cudnnFusedOpsConstParamLabel_t( + 28, + ); +} +impl cudnnFusedOpsConstParamLabel_t { + pub const CUDNN_PARAM_BN_Z_EQSCALE_PLACEHOLDER: cudnnFusedOpsConstParamLabel_t = cudnnFusedOpsConstParamLabel_t( + 29, + ); +} +impl cudnnFusedOpsConstParamLabel_t { + pub const CUDNN_PARAM_BN_Z_EQBIAS_PLACEHOLDER: cudnnFusedOpsConstParamLabel_t = cudnnFusedOpsConstParamLabel_t( + 30, + ); +} +impl cudnnFusedOpsConstParamLabel_t { + pub const CUDNN_PARAM_ACTIVATION_BITMASK_DESC: cudnnFusedOpsConstParamLabel_t = cudnnFusedOpsConstParamLabel_t( + 31, + ); +} +impl cudnnFusedOpsConstParamLabel_t { + pub const CUDNN_PARAM_ACTIVATION_BITMASK_PLACEHOLDER: cudnnFusedOpsConstParamLabel_t = cudnnFusedOpsConstParamLabel_t( + 32, + ); +} +impl cudnnFusedOpsConstParamLabel_t { + pub const CUDNN_PARAM_DXDESC: cudnnFusedOpsConstParamLabel_t = 
cudnnFusedOpsConstParamLabel_t( + 33, + ); +} +impl cudnnFusedOpsConstParamLabel_t { + pub const CUDNN_PARAM_DXDATA_PLACEHOLDER: cudnnFusedOpsConstParamLabel_t = cudnnFusedOpsConstParamLabel_t( + 34, + ); +} +impl cudnnFusedOpsConstParamLabel_t { + pub const CUDNN_PARAM_DZDESC: cudnnFusedOpsConstParamLabel_t = cudnnFusedOpsConstParamLabel_t( + 35, + ); +} +impl cudnnFusedOpsConstParamLabel_t { + pub const CUDNN_PARAM_DZDATA_PLACEHOLDER: cudnnFusedOpsConstParamLabel_t = cudnnFusedOpsConstParamLabel_t( + 36, + ); +} +impl cudnnFusedOpsConstParamLabel_t { + pub const CUDNN_PARAM_BN_DSCALE_PLACEHOLDER: cudnnFusedOpsConstParamLabel_t = cudnnFusedOpsConstParamLabel_t( + 37, + ); +} +impl cudnnFusedOpsConstParamLabel_t { + pub const CUDNN_PARAM_BN_DBIAS_PLACEHOLDER: cudnnFusedOpsConstParamLabel_t = cudnnFusedOpsConstParamLabel_t( + 38, + ); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnFusedOpsConstParamLabel_t(pub ::core::ffi::c_uint); +impl cudnnFusedOpsPointerPlaceHolder_t { + pub const CUDNN_PTR_NULL: cudnnFusedOpsPointerPlaceHolder_t = cudnnFusedOpsPointerPlaceHolder_t( + 0, + ); +} +impl cudnnFusedOpsPointerPlaceHolder_t { + pub const CUDNN_PTR_ELEM_ALIGNED: cudnnFusedOpsPointerPlaceHolder_t = cudnnFusedOpsPointerPlaceHolder_t( + 1, + ); +} +impl cudnnFusedOpsPointerPlaceHolder_t { + pub const CUDNN_PTR_16B_ALIGNED: cudnnFusedOpsPointerPlaceHolder_t = cudnnFusedOpsPointerPlaceHolder_t( + 2, + ); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnFusedOpsPointerPlaceHolder_t(pub ::core::ffi::c_uint); +impl cudnnFusedOpsVariantParamLabel_t { + pub const CUDNN_PTR_XDATA: cudnnFusedOpsVariantParamLabel_t = cudnnFusedOpsVariantParamLabel_t( + 0, + ); +} +impl cudnnFusedOpsVariantParamLabel_t { + pub const CUDNN_PTR_BN_EQSCALE: cudnnFusedOpsVariantParamLabel_t = cudnnFusedOpsVariantParamLabel_t( + 1, + ); +} +impl cudnnFusedOpsVariantParamLabel_t { + pub const CUDNN_PTR_BN_EQBIAS: cudnnFusedOpsVariantParamLabel_t = cudnnFusedOpsVariantParamLabel_t( + 2, + ); +} +impl cudnnFusedOpsVariantParamLabel_t { + pub const CUDNN_PTR_WDATA: cudnnFusedOpsVariantParamLabel_t = cudnnFusedOpsVariantParamLabel_t( + 3, + ); +} +impl cudnnFusedOpsVariantParamLabel_t { + pub const CUDNN_PTR_DWDATA: cudnnFusedOpsVariantParamLabel_t = cudnnFusedOpsVariantParamLabel_t( + 4, + ); +} +impl cudnnFusedOpsVariantParamLabel_t { + pub const CUDNN_PTR_YDATA: cudnnFusedOpsVariantParamLabel_t = cudnnFusedOpsVariantParamLabel_t( + 5, + ); +} +impl cudnnFusedOpsVariantParamLabel_t { + pub const CUDNN_PTR_DYDATA: cudnnFusedOpsVariantParamLabel_t = cudnnFusedOpsVariantParamLabel_t( + 6, + ); +} +impl cudnnFusedOpsVariantParamLabel_t { + pub const CUDNN_PTR_YSUM: cudnnFusedOpsVariantParamLabel_t = cudnnFusedOpsVariantParamLabel_t( + 7, + ); +} +impl cudnnFusedOpsVariantParamLabel_t { + pub const CUDNN_PTR_YSQSUM: cudnnFusedOpsVariantParamLabel_t = cudnnFusedOpsVariantParamLabel_t( + 8, + ); +} +impl cudnnFusedOpsVariantParamLabel_t { + pub const CUDNN_PTR_WORKSPACE: cudnnFusedOpsVariantParamLabel_t = cudnnFusedOpsVariantParamLabel_t( + 9, + ); +} +impl cudnnFusedOpsVariantParamLabel_t { + pub const CUDNN_PTR_BN_SCALE: cudnnFusedOpsVariantParamLabel_t = cudnnFusedOpsVariantParamLabel_t( + 10, + ); +} +impl cudnnFusedOpsVariantParamLabel_t { + pub const CUDNN_PTR_BN_BIAS: cudnnFusedOpsVariantParamLabel_t = cudnnFusedOpsVariantParamLabel_t( + 11, + ); +} +impl cudnnFusedOpsVariantParamLabel_t { + pub const CUDNN_PTR_BN_SAVED_MEAN: 
cudnnFusedOpsVariantParamLabel_t = cudnnFusedOpsVariantParamLabel_t( + 12, + ); +} +impl cudnnFusedOpsVariantParamLabel_t { + pub const CUDNN_PTR_BN_SAVED_INVSTD: cudnnFusedOpsVariantParamLabel_t = cudnnFusedOpsVariantParamLabel_t( + 13, + ); +} +impl cudnnFusedOpsVariantParamLabel_t { + pub const CUDNN_PTR_BN_RUNNING_MEAN: cudnnFusedOpsVariantParamLabel_t = cudnnFusedOpsVariantParamLabel_t( + 14, + ); +} +impl cudnnFusedOpsVariantParamLabel_t { + pub const CUDNN_PTR_BN_RUNNING_VAR: cudnnFusedOpsVariantParamLabel_t = cudnnFusedOpsVariantParamLabel_t( + 15, + ); +} +impl cudnnFusedOpsVariantParamLabel_t { + pub const CUDNN_PTR_ZDATA: cudnnFusedOpsVariantParamLabel_t = cudnnFusedOpsVariantParamLabel_t( + 16, + ); +} +impl cudnnFusedOpsVariantParamLabel_t { + pub const CUDNN_PTR_BN_Z_EQSCALE: cudnnFusedOpsVariantParamLabel_t = cudnnFusedOpsVariantParamLabel_t( + 17, + ); +} +impl cudnnFusedOpsVariantParamLabel_t { + pub const CUDNN_PTR_BN_Z_EQBIAS: cudnnFusedOpsVariantParamLabel_t = cudnnFusedOpsVariantParamLabel_t( + 18, + ); +} +impl cudnnFusedOpsVariantParamLabel_t { + pub const CUDNN_PTR_ACTIVATION_BITMASK: cudnnFusedOpsVariantParamLabel_t = cudnnFusedOpsVariantParamLabel_t( + 19, + ); +} +impl cudnnFusedOpsVariantParamLabel_t { + pub const CUDNN_PTR_DXDATA: cudnnFusedOpsVariantParamLabel_t = cudnnFusedOpsVariantParamLabel_t( + 20, + ); +} +impl cudnnFusedOpsVariantParamLabel_t { + pub const CUDNN_PTR_DZDATA: cudnnFusedOpsVariantParamLabel_t = cudnnFusedOpsVariantParamLabel_t( + 21, + ); +} +impl cudnnFusedOpsVariantParamLabel_t { + pub const CUDNN_PTR_BN_DSCALE: cudnnFusedOpsVariantParamLabel_t = cudnnFusedOpsVariantParamLabel_t( + 22, + ); +} +impl cudnnFusedOpsVariantParamLabel_t { + pub const CUDNN_PTR_BN_DBIAS: cudnnFusedOpsVariantParamLabel_t = cudnnFusedOpsVariantParamLabel_t( + 23, + ); +} +impl cudnnFusedOpsVariantParamLabel_t { + pub const CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES: cudnnFusedOpsVariantParamLabel_t = cudnnFusedOpsVariantParamLabel_t( + 100, + ); +} +impl cudnnFusedOpsVariantParamLabel_t { + pub const CUDNN_SCALAR_INT64_T_BN_ACCUMULATION_COUNT: cudnnFusedOpsVariantParamLabel_t = cudnnFusedOpsVariantParamLabel_t( + 101, + ); +} +impl cudnnFusedOpsVariantParamLabel_t { + pub const CUDNN_SCALAR_DOUBLE_BN_EXP_AVG_FACTOR: cudnnFusedOpsVariantParamLabel_t = cudnnFusedOpsVariantParamLabel_t( + 102, + ); +} +impl cudnnFusedOpsVariantParamLabel_t { + pub const CUDNN_SCALAR_DOUBLE_BN_EPSILON: cudnnFusedOpsVariantParamLabel_t = cudnnFusedOpsVariantParamLabel_t( + 103, + ); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnFusedOpsVariantParamLabel_t(pub ::core::ffi::c_uint); diff --git a/cuda_types/src/cudnn8.rs b/cuda_types/src/cudnn8.rs new file mode 100644 index 0000000..c41a918 --- /dev/null +++ b/cuda_types/src/cudnn8.rs @@ -0,0 +1,576 @@ +// Generated automatically by zluda_bindgen +// DO NOT EDIT MANUALLY +#![allow(warnings)] +pub type __half = u16; +pub type __nv_bfloat16 = u16; +pub use super::cuda::cuComplex; +pub use super::cuda::cuDoubleComplex; +pub use super::cuda::cudaDataType; +pub use super::cuda::cudaDataType_t; +pub type cudaStream_t = super::cuda::CUstream; +pub use super::cuda::libraryPropertyType; +pub type cudaGraphExecUpdateResultInfo_st = super::cuda::CUgraphExecUpdateResultInfo_st; +pub type cudaAsyncNotificationType = super::cuda::CUasyncNotificationType_enum; +pub type cudaGraph_t = super::cuda::CUgraph; +pub const CUDNN_MAJOR: u32 = 8; +pub const CUDNN_MINOR: u32 = 9; +pub const CUDNN_PATCHLEVEL: u32 
= 7; +pub const CUDNN_VERSION: u32 = 8907; +pub const CUDNN_MAX_SM_MAJOR_NUMBER: u32 = 9; +pub const CUDNN_MAX_SM_MINOR_NUMBER: u32 = 0; +pub const CUDNN_MAX_DEVICE_VERSION: u32 = 900; +pub const CUDNN_SM_50: u32 = 500; +pub const CUDNN_SM_52: u32 = 520; +pub const CUDNN_SM_53: u32 = 530; +pub const CUDNN_SM_60: u32 = 600; +pub const CUDNN_SM_61: u32 = 610; +pub const CUDNN_SM_62: u32 = 620; +pub const CUDNN_SM_70: u32 = 700; +pub const CUDNN_SM_72: u32 = 720; +pub const CUDNN_SM_75: u32 = 750; +pub const CUDNN_SM_80: u32 = 800; +pub const CUDNN_SM_86: u32 = 860; +pub const CUDNN_SM_87: u32 = 870; +pub const CUDNN_SM_89: u32 = 890; +pub const CUDNN_SM_90: u32 = 900; +pub const CUDNN_SM_9X_END: u32 = 999; +pub const CUDNN_MIN_DEVICE_VERSION: u32 = 500; +pub const CUDNN_OPS_INFER_MAJOR: u32 = 8; +pub const CUDNN_OPS_INFER_MINOR: u32 = 9; +pub const CUDNN_OPS_INFER_PATCH: u32 = 7; +pub const CUDNN_DIM_MAX: u32 = 8; +pub const CUDNN_LRN_MIN_N: u32 = 1; +pub const CUDNN_LRN_MAX_N: u32 = 16; +pub const CUDNN_LRN_MIN_K: f64 = 0.00001; +pub const CUDNN_LRN_MIN_BETA: f64 = 0.01; +pub const CUDNN_BN_MIN_EPSILON: f64 = 0.0; +pub const CUDNN_OPS_TRAIN_MAJOR: u32 = 8; +pub const CUDNN_OPS_TRAIN_MINOR: u32 = 9; +pub const CUDNN_OPS_TRAIN_PATCH: u32 = 7; +pub const CUDNN_ADV_INFER_MAJOR: u32 = 8; +pub const CUDNN_ADV_INFER_MINOR: u32 = 9; +pub const CUDNN_ADV_INFER_PATCH: u32 = 7; +pub const CUDNN_RNN_PADDED_IO_DISABLED: u32 = 0; +pub const CUDNN_RNN_PADDED_IO_ENABLED: u32 = 1; +pub const CUDNN_SEQDATA_DIM_COUNT: u32 = 4; +pub const CUDNN_ATTN_QUERYMAP_ALL_TO_ONE: u32 = 0; +pub const CUDNN_ATTN_QUERYMAP_ONE_TO_ONE: u32 = 1; +pub const CUDNN_ATTN_DISABLE_PROJ_BIASES: u32 = 0; +pub const CUDNN_ATTN_ENABLE_PROJ_BIASES: u32 = 2; +pub const CUDNN_ATTN_WKIND_COUNT: u32 = 8; +pub const CUDNN_ADV_TRAIN_MAJOR: u32 = 8; +pub const CUDNN_ADV_TRAIN_MINOR: u32 = 9; +pub const CUDNN_ADV_TRAIN_PATCH: u32 = 7; +pub const CUDNN_CNN_INFER_MAJOR: u32 = 8; +pub const CUDNN_CNN_INFER_MINOR: u32 = 9; +pub const CUDNN_CNN_INFER_PATCH: u32 = 7; +pub const CUDNN_CNN_TRAIN_MAJOR: u32 = 8; +pub const CUDNN_CNN_TRAIN_MINOR: u32 = 9; +pub const CUDNN_CNN_TRAIN_PATCH: u32 = 7; +pub use super::cudnn::cudnnContext; +pub type cudnnHandle_t = *mut cudnnContext; +impl cudnnStatus_t { + pub const CUDNN_STATUS_SUCCESS: cudnnStatus_t = cudnnStatus_t(0); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_NOT_INITIALIZED: cudnnStatus_t = cudnnStatus_t(1); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_ALLOC_FAILED: cudnnStatus_t = cudnnStatus_t(2); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_BAD_PARAM: cudnnStatus_t = cudnnStatus_t(3); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_INTERNAL_ERROR: cudnnStatus_t = cudnnStatus_t(4); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_INVALID_VALUE: cudnnStatus_t = cudnnStatus_t(5); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_ARCH_MISMATCH: cudnnStatus_t = cudnnStatus_t(6); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_MAPPING_ERROR: cudnnStatus_t = cudnnStatus_t(7); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_EXECUTION_FAILED: cudnnStatus_t = cudnnStatus_t(8); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_NOT_SUPPORTED: cudnnStatus_t = cudnnStatus_t(9); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_LICENSE_ERROR: cudnnStatus_t = cudnnStatus_t(10); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING: cudnnStatus_t = cudnnStatus_t( + 11, + ); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_RUNTIME_IN_PROGRESS: cudnnStatus_t = 
cudnnStatus_t(12); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_RUNTIME_FP_OVERFLOW: cudnnStatus_t = cudnnStatus_t(13); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_VERSION_MISMATCH: cudnnStatus_t = cudnnStatus_t(14); +} +#[repr(transparent)] +#[must_use] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnStatus_t(pub ::core::ffi::c_uint); +pub use super::cudnn::cudnnRuntimeTag_t; +pub use super::cudnn::cudnnErrQueryMode_t; +pub use super::cudnn::cudnnTensorStruct; +pub type cudnnTensorDescriptor_t = *mut cudnnTensorStruct; +pub use super::cudnn::cudnnPoolingStruct; +pub type cudnnPoolingDescriptor_t = *mut cudnnPoolingStruct; +pub use super::cudnn::cudnnFilterStruct; +pub type cudnnFilterDescriptor_t = *mut cudnnFilterStruct; +pub use super::cudnn::cudnnLRNStruct; +pub type cudnnLRNDescriptor_t = *mut cudnnLRNStruct; +pub use super::cudnn::cudnnActivationStruct; +pub type cudnnActivationDescriptor_t = *mut cudnnActivationStruct; +pub use super::cudnn::cudnnSpatialTransformerStruct; +pub type cudnnSpatialTransformerDescriptor_t = *mut cudnnSpatialTransformerStruct; +pub use super::cudnn::cudnnOpTensorStruct; +pub type cudnnOpTensorDescriptor_t = *mut cudnnOpTensorStruct; +pub use super::cudnn::cudnnReduceTensorStruct; +pub type cudnnReduceTensorDescriptor_t = *mut cudnnReduceTensorStruct; +pub use super::cudnn::cudnnCTCLossStruct; +pub type cudnnCTCLossDescriptor_t = *mut cudnnCTCLossStruct; +pub use super::cudnn::cudnnTensorTransformStruct; +pub type cudnnTensorTransformDescriptor_t = *mut cudnnTensorTransformStruct; +pub use super::cudnn9::cudnnDataType_t; +pub use super::cudnn::cudnnMathType_t; +pub use super::cudnn::cudnnNanPropagation_t; +pub use super::cudnn::cudnnDeterminism_t; +pub use super::cudnn::cudnnTensorFormat_t; +pub use super::cudnn::cudnnFoldingDirection_t; +pub use super::cudnn::cudnnOpTensorOp_t; +pub use super::cudnn::cudnnReduceTensorOp_t; +pub use super::cudnn::cudnnReduceTensorIndices_t; +pub use super::cudnn::cudnnIndicesType_t; +pub use super::cudnn::cudnnSoftmaxAlgorithm_t; +pub use super::cudnn::cudnnSoftmaxMode_t; +pub use super::cudnn::cudnnPoolingMode_t; +pub use super::cudnn::cudnnActivationMode_t; +pub use super::cudnn::cudnnLRNMode_t; +pub use super::cudnn::cudnnDivNormMode_t; +pub use super::cudnn::cudnnBatchNormMode_t; +pub use super::cudnn::cudnnBatchNormOps_t; +pub use super::cudnn::cudnnNormMode_t; +pub use super::cudnn::cudnnNormAlgo_t; +pub use super::cudnn::cudnnNormOps_t; +pub use super::cudnn::cudnnSamplerType_t; +pub use super::cudnn::cudnnDropoutStruct; +pub type cudnnDropoutDescriptor_t = *mut cudnnDropoutStruct; +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct cudnnAlgorithmStruct { + _unused: [u8; 0], +} +pub type cudnnAlgorithmDescriptor_t = *mut cudnnAlgorithmStruct; +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct cudnnAlgorithmPerformanceStruct { + _unused: [u8; 0], +} +pub type cudnnAlgorithmPerformance_t = *mut cudnnAlgorithmPerformanceStruct; +pub use super::cudnn::cudnnConvolutionFwdAlgo_t; +pub use super::cudnn::cudnnConvolutionBwdFilterAlgo_t; +pub use super::cudnn::cudnnConvolutionBwdDataAlgo_t; +pub use super::cudnn::cudnnRNNAlgo_t; +pub use super::cudnn::cudnnCTCLossAlgo_t; +#[repr(C)] +#[derive(Copy, Clone)] +pub struct cudnnAlgorithmUnionStruct { + pub algo: cudnnAlgorithmUnionStruct_Algorithm, +} +#[repr(C)] +#[derive(Copy, Clone)] +pub union cudnnAlgorithmUnionStruct_Algorithm { + pub convFwdAlgo: cudnnConvolutionFwdAlgo_t, + pub convBwdFilterAlgo: cudnnConvolutionBwdFilterAlgo_t, + 
pub convBwdDataAlgo: cudnnConvolutionBwdDataAlgo_t, + pub RNNAlgo: cudnnRNNAlgo_t, + pub CTCLossAlgo: cudnnCTCLossAlgo_t, +} +pub type cudnnAlgorithm_t = cudnnAlgorithmUnionStruct; +pub use super::cudnn::cudnnSeverity_t; +#[repr(C)] +pub struct cudnnDebugStruct { + pub cudnn_version: ::core::ffi::c_uint, + pub cudnnStatus: cudnnStatus_t, + pub time_sec: ::core::ffi::c_uint, + pub time_usec: ::core::ffi::c_uint, + pub time_delta: ::core::ffi::c_uint, + pub handle: cudnnHandle_t, + pub stream: cudaStream_t, + pub pid: ::core::ffi::c_ulonglong, + pub tid: ::core::ffi::c_ulonglong, + pub cudaDeviceId: ::core::ffi::c_int, + pub reserved: [::core::ffi::c_int; 15usize], +} +pub type cudnnDebug_t = cudnnDebugStruct; +pub type cudnnCallback_t = ::core::option::Option< + unsafe extern "C" fn( + sev: cudnnSeverity_t, + udata: *mut ::core::ffi::c_void, + dbg: *const cudnnDebug_t, + msg: *const ::core::ffi::c_char, + ), +>; +pub use super::cudnn::cudnnForwardMode_t; +pub use super::cudnn::cudnnRNNMode_t; +pub use super::cudnn::cudnnRNNBiasMode_t; +pub use super::cudnn::cudnnDirectionMode_t; +pub use super::cudnn::cudnnRNNInputMode_t; +pub use super::cudnn::cudnnRNNClipMode_t; +pub use super::cudnn::cudnnRNNDataLayout_t; +pub type cudnnRNNPaddingMode_t = ::core::ffi::c_uint; +pub use super::cudnn::cudnnRNNStruct; +pub type cudnnRNNDescriptor_t = *mut cudnnRNNStruct; +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct cudnnPersistentRNNPlan { + _unused: [u8; 0], +} +pub type cudnnPersistentRNNPlan_t = *mut cudnnPersistentRNNPlan; +pub use super::cudnn::cudnnRNNDataStruct; +pub type cudnnRNNDataDescriptor_t = *mut cudnnRNNDataStruct; +pub use super::cudnn::cudnnSeqDataAxis_t; +pub use super::cudnn::cudnnSeqDataStruct; +pub type cudnnSeqDataDescriptor_t = *mut cudnnSeqDataStruct; +pub type cudnnAttnQueryMap_t = ::core::ffi::c_uint; +pub use super::cudnn::cudnnAttnStruct; +pub type cudnnAttnDescriptor_t = *mut cudnnAttnStruct; +pub use super::cudnn::cudnnMultiHeadAttnWeightKind_t; +pub use super::cudnn::cudnnWgradMode_t; +pub use super::cudnn::cudnnLossNormalizationMode_t; +pub use super::cudnn::cudnnConvolutionStruct; +pub type cudnnConvolutionDescriptor_t = *mut cudnnConvolutionStruct; +pub use super::cudnn::cudnnConvolutionMode_t; +pub use super::cudnn::cudnnReorderType_t; +#[repr(C)] +#[derive(Debug, Copy, Clone, PartialEq)] +pub struct cudnnConvolutionFwdAlgoPerfStruct { + pub algo: cudnnConvolutionFwdAlgo_t, + pub status: cudnnStatus_t, + pub time: f32, + pub memory: usize, + pub determinism: cudnnDeterminism_t, + pub mathType: cudnnMathType_t, + pub reserved: [::core::ffi::c_int; 3usize], +} +pub type cudnnConvolutionFwdAlgoPerf_t = cudnnConvolutionFwdAlgoPerfStruct; +#[repr(C)] +#[derive(Debug, Copy, Clone, PartialEq)] +pub struct cudnnConvolutionBwdDataAlgoPerfStruct { + pub algo: cudnnConvolutionBwdDataAlgo_t, + pub status: cudnnStatus_t, + pub time: f32, + pub memory: usize, + pub determinism: cudnnDeterminism_t, + pub mathType: cudnnMathType_t, + pub reserved: [::core::ffi::c_int; 3usize], +} +pub type cudnnConvolutionBwdDataAlgoPerf_t = cudnnConvolutionBwdDataAlgoPerfStruct; +pub use super::cudnn::cudnnFusedOpsConstParamStruct; +pub type cudnnFusedOpsConstParamPack_t = *mut cudnnFusedOpsConstParamStruct; +pub use super::cudnn::cudnnFusedOpsVariantParamStruct; +pub type cudnnFusedOpsVariantParamPack_t = *mut cudnnFusedOpsVariantParamStruct; +pub use super::cudnn::cudnnFusedOpsPlanStruct; +pub type cudnnFusedOpsPlan_t = *mut cudnnFusedOpsPlanStruct; +pub use super::cudnn::cudnnFusedOps_t; +pub 
use super::cudnn::cudnnFusedOpsConstParamLabel_t; +pub use super::cudnn::cudnnFusedOpsPointerPlaceHolder_t; +pub use super::cudnn::cudnnFusedOpsVariantParamLabel_t; +#[repr(C)] +#[derive(Debug, Copy, Clone, PartialEq)] +pub struct cudnnConvolutionBwdFilterAlgoPerfStruct { + pub algo: cudnnConvolutionBwdFilterAlgo_t, + pub status: cudnnStatus_t, + pub time: f32, + pub memory: usize, + pub determinism: cudnnDeterminism_t, + pub mathType: cudnnMathType_t, + pub reserved: [::core::ffi::c_int; 3usize], +} +pub type cudnnConvolutionBwdFilterAlgoPerf_t = cudnnConvolutionBwdFilterAlgoPerfStruct; +pub type cudnnBackendDescriptor_t = *mut ::core::ffi::c_void; +pub use super::cudnn::cudnnFractionStruct; +pub type cudnnFraction_t = cudnnFractionStruct; +pub use super::cudnn9::cudnnPointwiseMode_t; +pub use super::cudnn::cudnnResampleMode_t; +pub use super::cudnn::cudnnSignalMode_t; +pub use super::cudnn::cudnnGenStatsMode_t; +pub use super::cudnn::cudnnBnFinalizeStatsMode_t; +pub use super::cudnn::cudnnRngDistribution_t; +pub use super::cudnn9::cudnnBackendAttributeName_t; +pub use super::cudnn::cudnnBackendAttributeType_t; +pub use super::cudnn9::cudnnBackendDescriptorType_t; +impl cudnnBackendNumericalNote_t { + pub const CUDNN_NUMERICAL_NOTE_TENSOR_CORE: cudnnBackendNumericalNote_t = cudnnBackendNumericalNote_t( + 0, + ); +} +impl cudnnBackendNumericalNote_t { + pub const CUDNN_NUMERICAL_NOTE_DOWN_CONVERT_INPUTS: cudnnBackendNumericalNote_t = cudnnBackendNumericalNote_t( + 1, + ); +} +impl cudnnBackendNumericalNote_t { + pub const CUDNN_NUMERICAL_NOTE_REDUCED_PRECISION_REDUCTION: cudnnBackendNumericalNote_t = cudnnBackendNumericalNote_t( + 2, + ); +} +impl cudnnBackendNumericalNote_t { + pub const CUDNN_NUMERICAL_NOTE_FFT: cudnnBackendNumericalNote_t = cudnnBackendNumericalNote_t( + 3, + ); +} +impl cudnnBackendNumericalNote_t { + pub const CUDNN_NUMERICAL_NOTE_NONDETERMINISTIC: cudnnBackendNumericalNote_t = cudnnBackendNumericalNote_t( + 4, + ); +} +impl cudnnBackendNumericalNote_t { + pub const CUDNN_NUMERICAL_NOTE_WINOGRAD: cudnnBackendNumericalNote_t = cudnnBackendNumericalNote_t( + 5, + ); +} +impl cudnnBackendNumericalNote_t { + pub const CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_4x4: cudnnBackendNumericalNote_t = cudnnBackendNumericalNote_t( + 6, + ); +} +impl cudnnBackendNumericalNote_t { + pub const CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_6x6: cudnnBackendNumericalNote_t = cudnnBackendNumericalNote_t( + 7, + ); +} +impl cudnnBackendNumericalNote_t { + pub const CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_13x13: cudnnBackendNumericalNote_t = cudnnBackendNumericalNote_t( + 8, + ); +} +impl cudnnBackendNumericalNote_t { + pub const CUDNN_NUMERICAL_NOTE_TYPE_COUNT: cudnnBackendNumericalNote_t = cudnnBackendNumericalNote_t( + 9, + ); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnBackendNumericalNote_t(pub ::core::ffi::c_uint); +impl cudnnBackendBehaviorNote_t { + pub const CUDNN_BEHAVIOR_NOTE_RUNTIME_COMPILATION: cudnnBackendBehaviorNote_t = cudnnBackendBehaviorNote_t( + 0, + ); +} +impl cudnnBackendBehaviorNote_t { + pub const CUDNN_BEHAVIOR_NOTE_REQUIRES_FILTER_INT8x32_REORDER: cudnnBackendBehaviorNote_t = cudnnBackendBehaviorNote_t( + 1, + ); +} +impl cudnnBackendBehaviorNote_t { + pub const CUDNN_BEHAVIOR_NOTE_REQUIRES_BIAS_INT8x32_REORDER: cudnnBackendBehaviorNote_t = cudnnBackendBehaviorNote_t( + 2, + ); +} +impl cudnnBackendBehaviorNote_t { + pub const CUDNN_BEHAVIOR_NOTE_TYPE_COUNT: cudnnBackendBehaviorNote_t = cudnnBackendBehaviorNote_t( + 3, + ); +} 
+#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnBackendBehaviorNote_t(pub ::core::ffi::c_uint); +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_SPLIT_K: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 0, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_SWIZZLE: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 1, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_TILE_SIZE: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 2, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_USE_TEX: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 3, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_EDGE: cudnnBackendKnobType_t = cudnnBackendKnobType_t(4); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_KBLOCK: cudnnBackendKnobType_t = cudnnBackendKnobType_t(5); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_LDGA: cudnnBackendKnobType_t = cudnnBackendKnobType_t(6); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_LDGB: cudnnBackendKnobType_t = cudnnBackendKnobType_t(7); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_CHUNK_K: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 8, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_SPLIT_H: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 9, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_WINO_TILE: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 10, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_MULTIPLY: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 11, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_SPLIT_K_BUF: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 12, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_TILEK: cudnnBackendKnobType_t = cudnnBackendKnobType_t(13); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_STAGES: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 14, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_REDUCTION_MODE: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 15, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_CTA_SPLIT_K_MODE: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 16, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_SPLIT_K_SLC: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 17, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_IDX_MODE: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 18, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_SLICED: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 19, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_SPLIT_RS: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 20, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_SINGLEBUFFER: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 21, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_LDGC: cudnnBackendKnobType_t = cudnnBackendKnobType_t(22); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_SPECFILT: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 23, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_KERNEL_CFG: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 24, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_WORKSPACE: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 25, + ); +} +impl cudnnBackendKnobType_t { + pub 
const CUDNN_KNOB_TYPE_TILE_CGA: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 26, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_TILE_CGA_M: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 27, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_TILE_CGA_N: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 28, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_BLOCK_SIZE: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 29, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_OCCUPANCY: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 30, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_ARRAY_SIZE_PER_THREAD: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 31, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_NUM_C_PER_BLOCK: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 32, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_SPLIT_COLS: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 33, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_TILE_ROWS: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 34, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_TILE_COLS: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 35, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_LOAD_SIZE: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 36, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_COUNTS: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 37, + ); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnBackendKnobType_t(pub ::core::ffi::c_uint); +pub use super::cudnn::cudnnBackendLayoutType_t; +pub use super::cudnn::cudnnBackendHeurMode_t; +pub use super::cudnn9::cudnnBackendTensorReordering_t; +pub use super::cudnn::cudnnPaddingMode_t; +pub use super::cudnn9::cudnnBackendNormMode_t; +pub use super::cudnn::cudnnBackendNormFwdPhase_t; diff --git a/cuda_types/src/cudnn9.rs b/cuda_types/src/cudnn9.rs new file mode 100644 index 0000000..e629c15 --- /dev/null +++ b/cuda_types/src/cudnn9.rs @@ -0,0 +1,2404 @@ +// Generated automatically by zluda_bindgen +// DO NOT EDIT MANUALLY +#![allow(warnings)] +pub type __half = u16; +pub type __nv_bfloat16 = u16; +pub use super::cuda::cuComplex; +pub use super::cuda::cuDoubleComplex; +pub use super::cuda::cudaDataType; +pub use super::cuda::cudaDataType_t; +pub type cudaStream_t = super::cuda::CUstream; +pub use super::cuda::libraryPropertyType; +pub type cudaGraphExecUpdateResultInfo_st = super::cuda::CUgraphExecUpdateResultInfo_st; +pub type cudaAsyncNotificationType = super::cuda::CUasyncNotificationType_enum; +pub type cudaGraph_t = super::cuda::CUgraph; +pub const CUDNN_MAJOR: u32 = 9; +pub const CUDNN_MINOR: u32 = 8; +pub const CUDNN_PATCHLEVEL: u32 = 0; +pub const CUDNN_VERSION: u32 = 90800; +pub const CUDNN_MAX_SM_MAJOR_NUMBER: u32 = 12; +pub const CUDNN_MAX_SM_MINOR_NUMBER: u32 = 0; +pub const CUDNN_MAX_DEVICE_VERSION: u32 = 1200; +pub const CUDNN_GRAPH_MAJOR: u32 = 9; +pub const CUDNN_GRAPH_MINOR: u32 = 8; +pub const CUDNN_GRAPH_PATCH: u32 = 0; +pub const CUDNN_DIM_MAX: u32 = 8; +pub const CUDNN_OPS_MAJOR: u32 = 9; +pub const CUDNN_OPS_MINOR: u32 = 8; +pub const CUDNN_OPS_PATCH: u32 = 0; +pub const CUDNN_LRN_MIN_N: u32 = 1; +pub const CUDNN_LRN_MAX_N: u32 = 16; +pub const CUDNN_LRN_MIN_K: f64 = 0.00001; +pub const CUDNN_LRN_MIN_BETA: f64 = 0.01; +pub const CUDNN_BN_MIN_EPSILON: f64 = 0.0; +pub const 
CUDNN_ADV_MAJOR: u32 = 9; +pub const CUDNN_ADV_MINOR: u32 = 8; +pub const CUDNN_ADV_PATCH: u32 = 0; +pub const CUDNN_RNN_PADDED_IO_DISABLED: u32 = 0; +pub const CUDNN_RNN_PADDED_IO_ENABLED: u32 = 1; +pub const CUDNN_SEQDATA_DIM_COUNT: u32 = 4; +pub const CUDNN_ATTN_QUERYMAP_ALL_TO_ONE: u32 = 0; +pub const CUDNN_ATTN_QUERYMAP_ONE_TO_ONE: u32 = 1; +pub const CUDNN_ATTN_DISABLE_PROJ_BIASES: u32 = 0; +pub const CUDNN_ATTN_ENABLE_PROJ_BIASES: u32 = 2; +pub const CUDNN_ATTN_WKIND_COUNT: u32 = 8; +pub const CUDNN_CNN_MAJOR: u32 = 9; +pub const CUDNN_CNN_MINOR: u32 = 8; +pub const CUDNN_CNN_PATCH: u32 = 0; +pub use super::cudnn::cudnnContext; +pub type cudnnHandle_t = *mut cudnnContext; +impl cudnnStatus_t { + pub const CUDNN_STATUS_SUCCESS: cudnnStatus_t = cudnnStatus_t(0); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_NOT_INITIALIZED: cudnnStatus_t = cudnnStatus_t(1001); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH: cudnnStatus_t = cudnnStatus_t( + 1002, + ); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_SERIALIZATION_VERSION_MISMATCH: cudnnStatus_t = cudnnStatus_t( + 1003, + ); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_DEPRECATED: cudnnStatus_t = cudnnStatus_t(1004); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_LICENSE_ERROR: cudnnStatus_t = cudnnStatus_t(1005); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_RUNTIME_IN_PROGRESS: cudnnStatus_t = cudnnStatus_t(1006); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_RUNTIME_FP_OVERFLOW: cudnnStatus_t = cudnnStatus_t(1007); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_SUBLIBRARY_LOADING_FAILED: cudnnStatus_t = cudnnStatus_t( + 1008, + ); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_BAD_PARAM: cudnnStatus_t = cudnnStatus_t(2000); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_BAD_PARAM_NULL_POINTER: cudnnStatus_t = cudnnStatus_t(2002); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_BAD_PARAM_MISALIGNED_POINTER: cudnnStatus_t = cudnnStatus_t( + 2003, + ); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_BAD_PARAM_NOT_FINALIZED: cudnnStatus_t = cudnnStatus_t(2004); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_BAD_PARAM_OUT_OF_BOUND: cudnnStatus_t = cudnnStatus_t(2005); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_BAD_PARAM_SIZE_INSUFFICIENT: cudnnStatus_t = cudnnStatus_t( + 2006, + ); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_BAD_PARAM_STREAM_MISMATCH: cudnnStatus_t = cudnnStatus_t( + 2007, + ); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_BAD_PARAM_SHAPE_MISMATCH: cudnnStatus_t = cudnnStatus_t(2008); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_BAD_PARAM_DUPLICATED_ENTRIES: cudnnStatus_t = cudnnStatus_t( + 2009, + ); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_BAD_PARAM_ATTRIBUTE_TYPE: cudnnStatus_t = cudnnStatus_t(2010); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_BAD_PARAM_CUDA_GRAPH_MISMATCH: cudnnStatus_t = cudnnStatus_t( + 2011, + ); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_BAD_PARAM_DESCRIPTOR_TYPE: cudnnStatus_t = cudnnStatus_t( + 2012, + ); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_NOT_SUPPORTED: cudnnStatus_t = cudnnStatus_t(3000); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_NOT_SUPPORTED_GRAPH_PATTERN: cudnnStatus_t = cudnnStatus_t( + 3001, + ); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_NOT_SUPPORTED_SHAPE: cudnnStatus_t = cudnnStatus_t(3002); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_NOT_SUPPORTED_DATA_TYPE: cudnnStatus_t = cudnnStatus_t(3003); +} +impl cudnnStatus_t { + pub const 
CUDNN_STATUS_NOT_SUPPORTED_LAYOUT: cudnnStatus_t = cudnnStatus_t(3004); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_NOT_SUPPORTED_INCOMPATIBLE_CUDA_DRIVER: cudnnStatus_t = cudnnStatus_t( + 3005, + ); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_NOT_SUPPORTED_INCOMPATIBLE_CUDART: cudnnStatus_t = cudnnStatus_t( + 3006, + ); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_NOT_SUPPORTED_ARCH_MISMATCH: cudnnStatus_t = cudnnStatus_t( + 3007, + ); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_NOT_SUPPORTED_RUNTIME_PREREQUISITE_MISSING: cudnnStatus_t = cudnnStatus_t( + 3008, + ); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_NOT_SUPPORTED_SUBLIBRARY_UNAVAILABLE: cudnnStatus_t = cudnnStatus_t( + 3009, + ); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_NOT_SUPPORTED_SHARED_MEMORY_INSUFFICIENT: cudnnStatus_t = cudnnStatus_t( + 3010, + ); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_NOT_SUPPORTED_PADDING: cudnnStatus_t = cudnnStatus_t(3011); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_NOT_SUPPORTED_BAD_LAUNCH_PARAM: cudnnStatus_t = cudnnStatus_t( + 3012, + ); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_NOT_SUPPORTED_CUDA_GRAPH_NATIVE_API: cudnnStatus_t = cudnnStatus_t( + 3013, + ); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_INTERNAL_ERROR: cudnnStatus_t = cudnnStatus_t(4000); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_INTERNAL_ERROR_COMPILATION_FAILED: cudnnStatus_t = cudnnStatus_t( + 4001, + ); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_INTERNAL_ERROR_UNEXPECTED_VALUE: cudnnStatus_t = cudnnStatus_t( + 4002, + ); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_INTERNAL_ERROR_HOST_ALLOCATION_FAILED: cudnnStatus_t = cudnnStatus_t( + 4003, + ); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_INTERNAL_ERROR_DEVICE_ALLOCATION_FAILED: cudnnStatus_t = cudnnStatus_t( + 4004, + ); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_INTERNAL_ERROR_BAD_LAUNCH_PARAM: cudnnStatus_t = cudnnStatus_t( + 4005, + ); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_INTERNAL_ERROR_TEXTURE_CREATION_FAILED: cudnnStatus_t = cudnnStatus_t( + 4006, + ); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_EXECUTION_FAILED: cudnnStatus_t = cudnnStatus_t(5000); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_EXECUTION_FAILED_CUDA_DRIVER: cudnnStatus_t = cudnnStatus_t( + 5001, + ); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_EXECUTION_FAILED_CUBLAS: cudnnStatus_t = cudnnStatus_t(5002); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_EXECUTION_FAILED_CUDART: cudnnStatus_t = cudnnStatus_t(5003); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_EXECUTION_FAILED_CURAND: cudnnStatus_t = cudnnStatus_t(5004); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_ALLOC_FAILED: cudnnStatus_t = cudnnStatus_t(4003); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_INVALID_VALUE: cudnnStatus_t = cudnnStatus_t(2001); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_ARCH_MISMATCH: cudnnStatus_t = cudnnStatus_t(3007); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_MAPPING_ERROR: cudnnStatus_t = cudnnStatus_t(4006); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING: cudnnStatus_t = cudnnStatus_t( + 3008, + ); +} +impl cudnnStatus_t { + pub const CUDNN_STATUS_VERSION_MISMATCH: cudnnStatus_t = cudnnStatus_t(1002); +} +#[repr(transparent)] +#[must_use] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnStatus_t(pub ::core::ffi::c_uint); +pub use super::cudnn::cudnnRuntimeTag_t; +pub use super::cudnn::cudnnErrQueryMode_t; +impl 
cudnnDataType_t { + pub const CUDNN_DATA_FLOAT: cudnnDataType_t = cudnnDataType_t(0); +} +impl cudnnDataType_t { + pub const CUDNN_DATA_DOUBLE: cudnnDataType_t = cudnnDataType_t(1); +} +impl cudnnDataType_t { + pub const CUDNN_DATA_HALF: cudnnDataType_t = cudnnDataType_t(2); +} +impl cudnnDataType_t { + pub const CUDNN_DATA_INT8: cudnnDataType_t = cudnnDataType_t(3); +} +impl cudnnDataType_t { + pub const CUDNN_DATA_INT32: cudnnDataType_t = cudnnDataType_t(4); +} +impl cudnnDataType_t { + pub const CUDNN_DATA_INT8x4: cudnnDataType_t = cudnnDataType_t(5); +} +impl cudnnDataType_t { + pub const CUDNN_DATA_UINT8: cudnnDataType_t = cudnnDataType_t(6); +} +impl cudnnDataType_t { + pub const CUDNN_DATA_UINT8x4: cudnnDataType_t = cudnnDataType_t(7); +} +impl cudnnDataType_t { + pub const CUDNN_DATA_INT8x32: cudnnDataType_t = cudnnDataType_t(8); +} +impl cudnnDataType_t { + pub const CUDNN_DATA_BFLOAT16: cudnnDataType_t = cudnnDataType_t(9); +} +impl cudnnDataType_t { + pub const CUDNN_DATA_INT64: cudnnDataType_t = cudnnDataType_t(10); +} +impl cudnnDataType_t { + pub const CUDNN_DATA_BOOLEAN: cudnnDataType_t = cudnnDataType_t(11); +} +impl cudnnDataType_t { + pub const CUDNN_DATA_FP8_E4M3: cudnnDataType_t = cudnnDataType_t(12); +} +impl cudnnDataType_t { + pub const CUDNN_DATA_FP8_E5M2: cudnnDataType_t = cudnnDataType_t(13); +} +impl cudnnDataType_t { + pub const CUDNN_DATA_FAST_FLOAT_FOR_FP8: cudnnDataType_t = cudnnDataType_t(14); +} +impl cudnnDataType_t { + pub const CUDNN_DATA_FP8_E8M0: cudnnDataType_t = cudnnDataType_t(15); +} +impl cudnnDataType_t { + pub const CUDNN_DATA_FP4_E2M1: cudnnDataType_t = cudnnDataType_t(16); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnDataType_t(pub ::core::ffi::c_uint); +pub use super::cudnn::cudnnMathType_t; +pub use super::cudnn::cudnnNanPropagation_t; +impl cudnnCTCGradMode_t { + pub const CUDNN_CTC_ZERO_OOB_GRADIENTS: cudnnCTCGradMode_t = cudnnCTCGradMode_t(0); +} +impl cudnnCTCGradMode_t { + pub const CUDNN_CTC_SKIP_OOB_GRADIENTS: cudnnCTCGradMode_t = cudnnCTCGradMode_t(1); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnCTCGradMode_t(pub ::core::ffi::c_uint); +pub use super::cudnn::cudnnTensorFormat_t; +pub use super::cudnn::cudnnReduceTensorOp_t; +pub use super::cudnn::cudnnActivationMode_t; +pub use super::cudnn::cudnnSeverity_t; +#[repr(C)] +pub struct cudnnDebugStruct { + pub cudnn_version: ::core::ffi::c_uint, + pub cudnnStatus: cudnnStatus_t, + pub time_sec: ::core::ffi::c_uint, + pub time_usec: ::core::ffi::c_uint, + pub time_delta: ::core::ffi::c_uint, + pub handle: cudnnHandle_t, + pub stream: cudaStream_t, + pub pid: ::core::ffi::c_ulonglong, + pub tid: ::core::ffi::c_ulonglong, + pub cudaDeviceId: ::core::ffi::c_int, + pub reserved: [::core::ffi::c_int; 15usize], +} +pub type cudnnDebug_t = cudnnDebugStruct; +pub type cudnnCallback_t = ::core::option::Option< + unsafe extern "C" fn( + sev: cudnnSeverity_t, + udata: *mut ::core::ffi::c_void, + dbg: *const cudnnDebug_t, + msg: *const ::core::ffi::c_char, + ), +>; +pub use super::cudnn::cudnnConvolutionMode_t; +pub use super::cudnn::cudnnReorderType_t; +pub type cudnnBackendDescriptor_t = *mut ::core::ffi::c_void; +pub use super::cudnn::cudnnFractionStruct; +pub type cudnnFraction_t = cudnnFractionStruct; +impl cudnnPointwiseMode_t { + pub const CUDNN_POINTWISE_ADD: cudnnPointwiseMode_t = cudnnPointwiseMode_t(0); +} +impl cudnnPointwiseMode_t { + pub const CUDNN_POINTWISE_ADD_SQUARE: 
cudnnPointwiseMode_t = cudnnPointwiseMode_t(5); +} +impl cudnnPointwiseMode_t { + pub const CUDNN_POINTWISE_DIV: cudnnPointwiseMode_t = cudnnPointwiseMode_t(6); +} +impl cudnnPointwiseMode_t { + pub const CUDNN_POINTWISE_MAX: cudnnPointwiseMode_t = cudnnPointwiseMode_t(3); +} +impl cudnnPointwiseMode_t { + pub const CUDNN_POINTWISE_MIN: cudnnPointwiseMode_t = cudnnPointwiseMode_t(2); +} +impl cudnnPointwiseMode_t { + pub const CUDNN_POINTWISE_MOD: cudnnPointwiseMode_t = cudnnPointwiseMode_t(7); +} +impl cudnnPointwiseMode_t { + pub const CUDNN_POINTWISE_MUL: cudnnPointwiseMode_t = cudnnPointwiseMode_t(1); +} +impl cudnnPointwiseMode_t { + pub const CUDNN_POINTWISE_POW: cudnnPointwiseMode_t = cudnnPointwiseMode_t(8); +} +impl cudnnPointwiseMode_t { + pub const CUDNN_POINTWISE_SUB: cudnnPointwiseMode_t = cudnnPointwiseMode_t(9); +} +impl cudnnPointwiseMode_t { + pub const CUDNN_POINTWISE_ABS: cudnnPointwiseMode_t = cudnnPointwiseMode_t(10); +} +impl cudnnPointwiseMode_t { + pub const CUDNN_POINTWISE_CEIL: cudnnPointwiseMode_t = cudnnPointwiseMode_t(11); +} +impl cudnnPointwiseMode_t { + pub const CUDNN_POINTWISE_COS: cudnnPointwiseMode_t = cudnnPointwiseMode_t(12); +} +impl cudnnPointwiseMode_t { + pub const CUDNN_POINTWISE_EXP: cudnnPointwiseMode_t = cudnnPointwiseMode_t(13); +} +impl cudnnPointwiseMode_t { + pub const CUDNN_POINTWISE_FLOOR: cudnnPointwiseMode_t = cudnnPointwiseMode_t(14); +} +impl cudnnPointwiseMode_t { + pub const CUDNN_POINTWISE_LOG: cudnnPointwiseMode_t = cudnnPointwiseMode_t(15); +} +impl cudnnPointwiseMode_t { + pub const CUDNN_POINTWISE_NEG: cudnnPointwiseMode_t = cudnnPointwiseMode_t(16); +} +impl cudnnPointwiseMode_t { + pub const CUDNN_POINTWISE_RSQRT: cudnnPointwiseMode_t = cudnnPointwiseMode_t(17); +} +impl cudnnPointwiseMode_t { + pub const CUDNN_POINTWISE_SIN: cudnnPointwiseMode_t = cudnnPointwiseMode_t(18); +} +impl cudnnPointwiseMode_t { + pub const CUDNN_POINTWISE_SQRT: cudnnPointwiseMode_t = cudnnPointwiseMode_t(4); +} +impl cudnnPointwiseMode_t { + pub const CUDNN_POINTWISE_TAN: cudnnPointwiseMode_t = cudnnPointwiseMode_t(19); +} +impl cudnnPointwiseMode_t { + pub const CUDNN_POINTWISE_ERF: cudnnPointwiseMode_t = cudnnPointwiseMode_t(20); +} +impl cudnnPointwiseMode_t { + pub const CUDNN_POINTWISE_IDENTITY: cudnnPointwiseMode_t = cudnnPointwiseMode_t(21); +} +impl cudnnPointwiseMode_t { + pub const CUDNN_POINTWISE_RECIPROCAL: cudnnPointwiseMode_t = cudnnPointwiseMode_t( + 22, + ); +} +impl cudnnPointwiseMode_t { + pub const CUDNN_POINTWISE_ATAN2: cudnnPointwiseMode_t = cudnnPointwiseMode_t(23); +} +impl cudnnPointwiseMode_t { + pub const CUDNN_POINTWISE_RELU_FWD: cudnnPointwiseMode_t = cudnnPointwiseMode_t(100); +} +impl cudnnPointwiseMode_t { + pub const CUDNN_POINTWISE_TANH_FWD: cudnnPointwiseMode_t = cudnnPointwiseMode_t(101); +} +impl cudnnPointwiseMode_t { + pub const CUDNN_POINTWISE_SIGMOID_FWD: cudnnPointwiseMode_t = cudnnPointwiseMode_t( + 102, + ); +} +impl cudnnPointwiseMode_t { + pub const CUDNN_POINTWISE_ELU_FWD: cudnnPointwiseMode_t = cudnnPointwiseMode_t(103); +} +impl cudnnPointwiseMode_t { + pub const CUDNN_POINTWISE_GELU_FWD: cudnnPointwiseMode_t = cudnnPointwiseMode_t(104); +} +impl cudnnPointwiseMode_t { + pub const CUDNN_POINTWISE_SOFTPLUS_FWD: cudnnPointwiseMode_t = cudnnPointwiseMode_t( + 105, + ); +} +impl cudnnPointwiseMode_t { + pub const CUDNN_POINTWISE_SWISH_FWD: cudnnPointwiseMode_t = cudnnPointwiseMode_t( + 106, + ); +} +impl cudnnPointwiseMode_t { + pub const CUDNN_POINTWISE_GELU_APPROX_TANH_FWD: cudnnPointwiseMode_t = 
cudnnPointwiseMode_t( + 107, + ); +} +impl cudnnPointwiseMode_t { + pub const CUDNN_POINTWISE_RELU_BWD: cudnnPointwiseMode_t = cudnnPointwiseMode_t(200); +} +impl cudnnPointwiseMode_t { + pub const CUDNN_POINTWISE_TANH_BWD: cudnnPointwiseMode_t = cudnnPointwiseMode_t(201); +} +impl cudnnPointwiseMode_t { + pub const CUDNN_POINTWISE_SIGMOID_BWD: cudnnPointwiseMode_t = cudnnPointwiseMode_t( + 202, + ); +} +impl cudnnPointwiseMode_t { + pub const CUDNN_POINTWISE_ELU_BWD: cudnnPointwiseMode_t = cudnnPointwiseMode_t(203); +} +impl cudnnPointwiseMode_t { + pub const CUDNN_POINTWISE_GELU_BWD: cudnnPointwiseMode_t = cudnnPointwiseMode_t(204); +} +impl cudnnPointwiseMode_t { + pub const CUDNN_POINTWISE_SOFTPLUS_BWD: cudnnPointwiseMode_t = cudnnPointwiseMode_t( + 205, + ); +} +impl cudnnPointwiseMode_t { + pub const CUDNN_POINTWISE_SWISH_BWD: cudnnPointwiseMode_t = cudnnPointwiseMode_t( + 206, + ); +} +impl cudnnPointwiseMode_t { + pub const CUDNN_POINTWISE_GELU_APPROX_TANH_BWD: cudnnPointwiseMode_t = cudnnPointwiseMode_t( + 207, + ); +} +impl cudnnPointwiseMode_t { + pub const CUDNN_POINTWISE_CMP_EQ: cudnnPointwiseMode_t = cudnnPointwiseMode_t(300); +} +impl cudnnPointwiseMode_t { + pub const CUDNN_POINTWISE_CMP_NEQ: cudnnPointwiseMode_t = cudnnPointwiseMode_t(301); +} +impl cudnnPointwiseMode_t { + pub const CUDNN_POINTWISE_CMP_GT: cudnnPointwiseMode_t = cudnnPointwiseMode_t(302); +} +impl cudnnPointwiseMode_t { + pub const CUDNN_POINTWISE_CMP_GE: cudnnPointwiseMode_t = cudnnPointwiseMode_t(303); +} +impl cudnnPointwiseMode_t { + pub const CUDNN_POINTWISE_CMP_LT: cudnnPointwiseMode_t = cudnnPointwiseMode_t(304); +} +impl cudnnPointwiseMode_t { + pub const CUDNN_POINTWISE_CMP_LE: cudnnPointwiseMode_t = cudnnPointwiseMode_t(305); +} +impl cudnnPointwiseMode_t { + pub const CUDNN_POINTWISE_LOGICAL_AND: cudnnPointwiseMode_t = cudnnPointwiseMode_t( + 400, + ); +} +impl cudnnPointwiseMode_t { + pub const CUDNN_POINTWISE_LOGICAL_OR: cudnnPointwiseMode_t = cudnnPointwiseMode_t( + 401, + ); +} +impl cudnnPointwiseMode_t { + pub const CUDNN_POINTWISE_LOGICAL_NOT: cudnnPointwiseMode_t = cudnnPointwiseMode_t( + 402, + ); +} +impl cudnnPointwiseMode_t { + pub const CUDNN_POINTWISE_GEN_INDEX: cudnnPointwiseMode_t = cudnnPointwiseMode_t( + 501, + ); +} +impl cudnnPointwiseMode_t { + pub const CUDNN_POINTWISE_BINARY_SELECT: cudnnPointwiseMode_t = cudnnPointwiseMode_t( + 601, + ); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnPointwiseMode_t(pub ::core::ffi::c_uint); +pub use super::cudnn::cudnnResampleMode_t; +pub use super::cudnn::cudnnSignalMode_t; +pub use super::cudnn::cudnnGenStatsMode_t; +pub use super::cudnn::cudnnBnFinalizeStatsMode_t; +pub use super::cudnn::cudnnRngDistribution_t; +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_POINTWISE_MODE: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 0, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_POINTWISE_MATH_PREC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_POINTWISE_NAN_PROPAGATION: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 2, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_POINTWISE_RELU_LOWER_CLIP: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 3, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_POINTWISE_RELU_UPPER_CLIP: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 4, + ); +} +impl 
cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_POINTWISE_RELU_LOWER_CLIP_SLOPE: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 5, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_POINTWISE_ELU_ALPHA: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 6, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_POINTWISE_SOFTPLUS_BETA: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 7, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_POINTWISE_SWISH_BETA: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 8, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_POINTWISE_AXIS: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 9, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_CONVOLUTION_COMP_TYPE: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 100, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_CONVOLUTION_CONV_MODE: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 101, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_CONVOLUTION_DILATIONS: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 102, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_CONVOLUTION_FILTER_STRIDES: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 103, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_CONVOLUTION_POST_PADDINGS: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 104, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_CONVOLUTION_PRE_PADDINGS: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 105, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_CONVOLUTION_SPATIAL_DIMS: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 106, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_ENGINEHEUR_MODE: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 200, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_ENGINEHEUR_OPERATION_GRAPH: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 201, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_ENGINEHEUR_RESULTS: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 202, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_ENGINEHEUR_SM_COUNT_TARGET: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 203, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_ENGINEHEUR_DEVICEPROP: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 204, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_ENGINECFG_ENGINE: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 300, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_ENGINECFG_INTERMEDIATE_INFO: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 301, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_ENGINECFG_KNOB_CHOICES: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 302, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_ENGINECFG_WORKSPACE_SIZE: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 303, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_ENGINECFG_SHARED_MEMORY_USED: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 304, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_EXECUTION_PLAN_HANDLE: 
cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 400, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_EXECUTION_PLAN_ENGINE_CONFIG: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 401, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_EXECUTION_PLAN_WORKSPACE_SIZE: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 402, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_EXECUTION_PLAN_COMPUTED_INTERMEDIATE_UIDS: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 403, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_EXECUTION_PLAN_RUN_ONLY_INTERMEDIATE_UIDS: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 404, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_EXECUTION_PLAN_JSON_REPRESENTATION: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 405, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_EXECUTION_PLAN_KERNEL_CACHE: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 406, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_EXECUTION_PLAN_DEVICEPROP: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 407, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_INTERMEDIATE_INFO_UNIQUE_ID: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 500, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_INTERMEDIATE_INFO_SIZE: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 501, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_INTERMEDIATE_INFO_DEPENDENT_DATA_UIDS: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 502, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_INTERMEDIATE_INFO_DEPENDENT_ATTRIBUTES: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 503, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_KNOB_CHOICE_KNOB_TYPE: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 600, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_KNOB_CHOICE_KNOB_VALUE: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 601, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_ALPHA: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 700, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_BETA: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 701, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_CONV_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 702, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_W: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 703, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_X: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 704, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_Y: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 705, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_ALPHA: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 706, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_BETA: cudnnBackendAttributeName_t = 
cudnnBackendAttributeName_t( + 707, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_CONV_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 708, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_W: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 709, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DX: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 710, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DY: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 711, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_ALPHA: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 712, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_BETA: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 713, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_CONV_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 714, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DW: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 715, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_X: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 716, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DY: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 717, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_POINTWISE_PW_DESCRIPTOR: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 750, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_POINTWISE_XDESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 751, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_POINTWISE_BDESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 752, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_POINTWISE_YDESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 753, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_POINTWISE_ALPHA1: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 754, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_POINTWISE_ALPHA2: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 755, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_POINTWISE_DXDESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 756, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_POINTWISE_DYDESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 757, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_POINTWISE_TDESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 758, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_GENSTATS_MODE: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 770, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_GENSTATS_MATH_PREC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 771, + ); +} +impl 
cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_GENSTATS_XDESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 772, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_GENSTATS_SUMDESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 773, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_GENSTATS_SQSUMDESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 774, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_BN_FINALIZE_STATS_MODE: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 780, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_BN_FINALIZE_MATH_PREC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 781, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_BN_FINALIZE_Y_SUM_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 782, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_BN_FINALIZE_Y_SQ_SUM_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 783, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_BN_FINALIZE_SCALE_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 784, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_BN_FINALIZE_BIAS_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 785, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_BN_FINALIZE_PREV_RUNNING_MEAN_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 786, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_BN_FINALIZE_PREV_RUNNING_VAR_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 787, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_BN_FINALIZE_UPDATED_RUNNING_MEAN_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 788, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_BN_FINALIZE_UPDATED_RUNNING_VAR_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 789, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_BN_FINALIZE_SAVED_MEAN_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 790, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_BN_FINALIZE_SAVED_INV_STD_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 791, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_BN_FINALIZE_EQ_SCALE_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 792, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_BN_FINALIZE_EQ_BIAS_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 793, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_BN_FINALIZE_ACCUM_COUNT_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 794, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_BN_FINALIZE_EPSILON_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 795, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_BN_FINALIZE_EXP_AVERATE_FACTOR_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 796, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATIONGRAPH_HANDLE: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 
800, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATIONGRAPH_OPS: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 801, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATIONGRAPH_ENGINE_GLOBAL_COUNT: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 802, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATIONGRAPH_IS_DYNAMIC_SHAPE_ENABLED: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 803, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATIONGRAPH_IS_SAME_TOPOLOGY: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 804, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_TENSOR_BYTE_ALIGNMENT: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 900, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_TENSOR_DATA_TYPE: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 901, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_TENSOR_DIMENSIONS: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 902, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_TENSOR_STRIDES: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 903, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_TENSOR_VECTOR_COUNT: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 904, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_TENSOR_VECTORIZED_DIMENSION: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 905, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_TENSOR_UNIQUE_ID: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 906, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_TENSOR_IS_VIRTUAL: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 907, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_TENSOR_IS_BY_VALUE: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 908, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_TENSOR_REORDERING_MODE: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 909, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_TENSOR_RAGGED_OFFSET_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 913, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_VARIANT_PACK_UNIQUE_IDS: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1000, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_VARIANT_PACK_DATA_POINTERS: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1001, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_VARIANT_PACK_INTERMEDIATES: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1002, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_VARIANT_PACK_WORKSPACE: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1003, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_LAYOUT_INFO_TENSOR_UID: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1100, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_LAYOUT_INFO_TYPES: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1101, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_KNOB_INFO_TYPE: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1200, + ); +} +impl cudnnBackendAttributeName_t { + pub const 
CUDNN_ATTR_KNOB_INFO_MAXIMUM_VALUE: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1201, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_KNOB_INFO_MINIMUM_VALUE: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1202, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_KNOB_INFO_STRIDE: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1203, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_ENGINE_OPERATION_GRAPH: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1300, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_ENGINE_GLOBAL_INDEX: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1301, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_ENGINE_KNOB_INFO: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1302, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_ENGINE_NUMERICAL_NOTE: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1303, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_ENGINE_LAYOUT_INFO: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1304, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_ENGINE_BEHAVIOR_NOTE: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1305, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_ENGINE_SM_COUNT_TARGET: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1306, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_ENGINE_DEVICEPROP: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1307, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_MATMUL_COMP_TYPE: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1500, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_MATMUL_PADDING_VALUE: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1503, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_MATMUL_ADESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1520, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_MATMUL_BDESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1521, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_MATMUL_CDESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1522, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_MATMUL_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1523, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_MATMUL_IRREGULARLY_STRIDED_BATCH_COUNT: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1524, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_MATMUL_GEMM_M_OVERRIDE_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1525, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_MATMUL_GEMM_N_OVERRIDE_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1526, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_MATMUL_GEMM_K_OVERRIDE_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1527, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_REDUCTION_OPERATOR: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1600, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_REDUCTION_COMP_TYPE: 
cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1601, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_REDUCTION_XDESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1610, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_REDUCTION_YDESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1611, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_REDUCTION_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1612, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_MATH_PREC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1620, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_MEAN_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1621, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_INVSTD_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1622, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_BN_SCALE_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1623, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_X_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1624, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DY_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1625, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DBN_SCALE_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1626, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DBN_BIAS_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1627, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_DY_SCALE_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1628, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_X_SCALE_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1629, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_BIAS: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1630, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_RESAMPLE_MODE: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1700, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_RESAMPLE_COMP_TYPE: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1701, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_RESAMPLE_SPATIAL_DIMS: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1702, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_RESAMPLE_POST_PADDINGS: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1703, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_RESAMPLE_PRE_PADDINGS: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1704, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_RESAMPLE_STRIDES: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1705, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_RESAMPLE_WINDOW_DIMS: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1706, + ); +} +impl 
cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_RESAMPLE_NAN_PROPAGATION: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1707, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_RESAMPLE_PADDING_MODE: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1708, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_RESAMPLE_FWD_XDESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1710, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_RESAMPLE_FWD_YDESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1711, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_RESAMPLE_FWD_IDXDESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1712, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_RESAMPLE_FWD_ALPHA: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1713, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_RESAMPLE_FWD_BETA: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1714, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_RESAMPLE_FWD_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1716, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DXDESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1720, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DYDESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1721, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_RESAMPLE_BWD_IDXDESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1722, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_RESAMPLE_BWD_ALPHA: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1723, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_RESAMPLE_BWD_BETA: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1724, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1725, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_RESAMPLE_BWD_XDESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1726, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_RESAMPLE_BWD_YDESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1727, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_CONCAT_AXIS: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1800, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_CONCAT_INPUT_DESCS: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1801, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_CONCAT_INPLACE_INDEX: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1802, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_CONCAT_OUTPUT_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1803, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_SIGNAL_MODE: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1900, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_SIGNAL_FLAGDESC: cudnnBackendAttributeName_t = 
cudnnBackendAttributeName_t( + 1901, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_SIGNAL_VALUE: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1902, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_SIGNAL_XDESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1903, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_SIGNAL_YDESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1904, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_CONTAINER_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1950, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_YDESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1951, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_SEQUENCE_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1952, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_PAGE_TABLE_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 1953, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_NORM_FWD_MODE: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 2000, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_NORM_FWD_PHASE: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 2001, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_NORM_FWD_XDESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 2002, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_NORM_FWD_MEAN_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 2003, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_NORM_FWD_INV_VARIANCE_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 2004, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_NORM_FWD_SCALE_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 2005, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_NORM_FWD_BIAS_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 2006, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_NORM_FWD_EPSILON_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 2007, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_NORM_FWD_EXP_AVG_FACTOR_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 2008, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_NORM_FWD_INPUT_RUNNING_MEAN_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 2009, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_NORM_FWD_INPUT_RUNNING_VAR_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 2010, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_NORM_FWD_OUTPUT_RUNNING_MEAN_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 2011, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_NORM_FWD_OUTPUT_RUNNING_VAR_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 2012, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_NORM_FWD_YDESC: cudnnBackendAttributeName_t = 
cudnnBackendAttributeName_t( + 2013, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_NORM_FWD_PEER_STAT_DESCS: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 2014, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_NORM_BWD_MODE: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 2100, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_NORM_BWD_XDESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 2101, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_NORM_BWD_MEAN_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 2102, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_NORM_BWD_INV_VARIANCE_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 2103, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_NORM_BWD_DYDESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 2104, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_NORM_BWD_SCALE_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 2105, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_NORM_BWD_EPSILON_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 2106, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_NORM_BWD_DSCALE_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 2107, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_NORM_BWD_DBIAS_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 2108, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_NORM_BWD_DXDESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 2109, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_NORM_BWD_PEER_STAT_DESCS: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 2110, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_RESHAPE_XDESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 2200, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_RESHAPE_YDESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 2201, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_RNG_DISTRIBUTION: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 2300, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_RNG_NORMAL_DIST_MEAN: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 2301, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_RNG_NORMAL_DIST_STANDARD_DEVIATION: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 2302, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_RNG_UNIFORM_DIST_MAXIMUM: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 2303, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_RNG_UNIFORM_DIST_MINIMUM: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 2304, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_RNG_BERNOULLI_DIST_PROBABILITY: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 2305, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_RNG_YDESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 2310, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_RNG_SEED: 
cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 2311, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_RNG_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 2312, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_RNG_OFFSET_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 2313, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_KERNEL_CACHE_OPERATION_GRAPH: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 2400, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_KERNEL_CACHE_IS_ENGINECFG_KERNEL_CACHED: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 2401, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_XDESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 2500, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_YDESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 2501, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_SCALE_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 2502, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_MATH_PREC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 2503, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_BLOCK_SIZE: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 2504, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_XDESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 2600, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_SCALE_DESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 2601, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_YDESC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 2602, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_MATH_PREC: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 2603, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_BLOCK_SIZE: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 2604, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_DEVICEPROP_DEVICE_ID: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 2700, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_DEVICEPROP_HANDLE: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 2701, + ); +} +impl cudnnBackendAttributeName_t { + pub const CUDNN_ATTR_DEVICEPROP_JSON_REPRESENTATION: cudnnBackendAttributeName_t = cudnnBackendAttributeName_t( + 2702, + ); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnBackendAttributeName_t(pub ::core::ffi::c_uint); +pub use super::cudnn::cudnnBackendAttributeType_t; +impl cudnnBackendDescriptorType_t { + pub const CUDNN_BACKEND_POINTWISE_DESCRIPTOR: cudnnBackendDescriptorType_t = cudnnBackendDescriptorType_t( + 0, + ); +} +impl cudnnBackendDescriptorType_t { + pub const CUDNN_BACKEND_CONVOLUTION_DESCRIPTOR: cudnnBackendDescriptorType_t = cudnnBackendDescriptorType_t( + 1, + ); +} +impl cudnnBackendDescriptorType_t { + pub const 
CUDNN_BACKEND_ENGINE_DESCRIPTOR: cudnnBackendDescriptorType_t = cudnnBackendDescriptorType_t( + 2, + ); +} +impl cudnnBackendDescriptorType_t { + pub const CUDNN_BACKEND_ENGINECFG_DESCRIPTOR: cudnnBackendDescriptorType_t = cudnnBackendDescriptorType_t( + 3, + ); +} +impl cudnnBackendDescriptorType_t { + pub const CUDNN_BACKEND_ENGINEHEUR_DESCRIPTOR: cudnnBackendDescriptorType_t = cudnnBackendDescriptorType_t( + 4, + ); +} +impl cudnnBackendDescriptorType_t { + pub const CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnBackendDescriptorType_t = cudnnBackendDescriptorType_t( + 5, + ); +} +impl cudnnBackendDescriptorType_t { + pub const CUDNN_BACKEND_INTERMEDIATE_INFO_DESCRIPTOR: cudnnBackendDescriptorType_t = cudnnBackendDescriptorType_t( + 6, + ); +} +impl cudnnBackendDescriptorType_t { + pub const CUDNN_BACKEND_KNOB_CHOICE_DESCRIPTOR: cudnnBackendDescriptorType_t = cudnnBackendDescriptorType_t( + 7, + ); +} +impl cudnnBackendDescriptorType_t { + pub const CUDNN_BACKEND_KNOB_INFO_DESCRIPTOR: cudnnBackendDescriptorType_t = cudnnBackendDescriptorType_t( + 8, + ); +} +impl cudnnBackendDescriptorType_t { + pub const CUDNN_BACKEND_LAYOUT_INFO_DESCRIPTOR: cudnnBackendDescriptorType_t = cudnnBackendDescriptorType_t( + 9, + ); +} +impl cudnnBackendDescriptorType_t { + pub const CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR: cudnnBackendDescriptorType_t = cudnnBackendDescriptorType_t( + 10, + ); +} +impl cudnnBackendDescriptorType_t { + pub const CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR: cudnnBackendDescriptorType_t = cudnnBackendDescriptorType_t( + 11, + ); +} +impl cudnnBackendDescriptorType_t { + pub const CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR: cudnnBackendDescriptorType_t = cudnnBackendDescriptorType_t( + 12, + ); +} +impl cudnnBackendDescriptorType_t { + pub const CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR: cudnnBackendDescriptorType_t = cudnnBackendDescriptorType_t( + 13, + ); +} +impl cudnnBackendDescriptorType_t { + pub const CUDNN_BACKEND_OPERATION_GEN_STATS_DESCRIPTOR: cudnnBackendDescriptorType_t = cudnnBackendDescriptorType_t( + 14, + ); +} +impl cudnnBackendDescriptorType_t { + pub const CUDNN_BACKEND_OPERATIONGRAPH_DESCRIPTOR: cudnnBackendDescriptorType_t = cudnnBackendDescriptorType_t( + 15, + ); +} +impl cudnnBackendDescriptorType_t { + pub const CUDNN_BACKEND_VARIANT_PACK_DESCRIPTOR: cudnnBackendDescriptorType_t = cudnnBackendDescriptorType_t( + 16, + ); +} +impl cudnnBackendDescriptorType_t { + pub const CUDNN_BACKEND_TENSOR_DESCRIPTOR: cudnnBackendDescriptorType_t = cudnnBackendDescriptorType_t( + 17, + ); +} +impl cudnnBackendDescriptorType_t { + pub const CUDNN_BACKEND_MATMUL_DESCRIPTOR: cudnnBackendDescriptorType_t = cudnnBackendDescriptorType_t( + 18, + ); +} +impl cudnnBackendDescriptorType_t { + pub const CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR: cudnnBackendDescriptorType_t = cudnnBackendDescriptorType_t( + 19, + ); +} +impl cudnnBackendDescriptorType_t { + pub const CUDNN_BACKEND_OPERATION_BN_FINALIZE_STATISTICS_DESCRIPTOR: cudnnBackendDescriptorType_t = cudnnBackendDescriptorType_t( + 20, + ); +} +impl cudnnBackendDescriptorType_t { + pub const CUDNN_BACKEND_REDUCTION_DESCRIPTOR: cudnnBackendDescriptorType_t = cudnnBackendDescriptorType_t( + 21, + ); +} +impl cudnnBackendDescriptorType_t { + pub const CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR: cudnnBackendDescriptorType_t = cudnnBackendDescriptorType_t( + 22, + ); +} +impl cudnnBackendDescriptorType_t { + pub const CUDNN_BACKEND_OPERATION_BN_BWD_WEIGHTS_DESCRIPTOR: 
cudnnBackendDescriptorType_t = cudnnBackendDescriptorType_t( + 23, + ); +} +impl cudnnBackendDescriptorType_t { + pub const CUDNN_BACKEND_RESAMPLE_DESCRIPTOR: cudnnBackendDescriptorType_t = cudnnBackendDescriptorType_t( + 24, + ); +} +impl cudnnBackendDescriptorType_t { + pub const CUDNN_BACKEND_OPERATION_RESAMPLE_FWD_DESCRIPTOR: cudnnBackendDescriptorType_t = cudnnBackendDescriptorType_t( + 25, + ); +} +impl cudnnBackendDescriptorType_t { + pub const CUDNN_BACKEND_OPERATION_RESAMPLE_BWD_DESCRIPTOR: cudnnBackendDescriptorType_t = cudnnBackendDescriptorType_t( + 26, + ); +} +impl cudnnBackendDescriptorType_t { + pub const CUDNN_BACKEND_OPERATION_CONCAT_DESCRIPTOR: cudnnBackendDescriptorType_t = cudnnBackendDescriptorType_t( + 27, + ); +} +impl cudnnBackendDescriptorType_t { + pub const CUDNN_BACKEND_OPERATION_SIGNAL_DESCRIPTOR: cudnnBackendDescriptorType_t = cudnnBackendDescriptorType_t( + 28, + ); +} +impl cudnnBackendDescriptorType_t { + pub const CUDNN_BACKEND_OPERATION_NORM_FORWARD_DESCRIPTOR: cudnnBackendDescriptorType_t = cudnnBackendDescriptorType_t( + 29, + ); +} +impl cudnnBackendDescriptorType_t { + pub const CUDNN_BACKEND_OPERATION_NORM_BACKWARD_DESCRIPTOR: cudnnBackendDescriptorType_t = cudnnBackendDescriptorType_t( + 30, + ); +} +impl cudnnBackendDescriptorType_t { + pub const CUDNN_BACKEND_OPERATION_RESHAPE_DESCRIPTOR: cudnnBackendDescriptorType_t = cudnnBackendDescriptorType_t( + 31, + ); +} +impl cudnnBackendDescriptorType_t { + pub const CUDNN_BACKEND_RNG_DESCRIPTOR: cudnnBackendDescriptorType_t = cudnnBackendDescriptorType_t( + 32, + ); +} +impl cudnnBackendDescriptorType_t { + pub const CUDNN_BACKEND_OPERATION_RNG_DESCRIPTOR: cudnnBackendDescriptorType_t = cudnnBackendDescriptorType_t( + 33, + ); +} +impl cudnnBackendDescriptorType_t { + pub const CUDNN_BACKEND_KERNEL_CACHE_DESCRIPTOR: cudnnBackendDescriptorType_t = cudnnBackendDescriptorType_t( + 34, + ); +} +impl cudnnBackendDescriptorType_t { + pub const CUDNN_BACKEND_OPERATION_PAGED_CACHE_LOAD_DESCRIPTOR: cudnnBackendDescriptorType_t = cudnnBackendDescriptorType_t( + 35, + ); +} +impl cudnnBackendDescriptorType_t { + pub const CUDNN_BACKEND_OPERATION_BLOCK_SCALE_QUANTIZE_DESCRIPTOR: cudnnBackendDescriptorType_t = cudnnBackendDescriptorType_t( + 36, + ); +} +impl cudnnBackendDescriptorType_t { + pub const CUDNN_BACKEND_OPERATION_BLOCK_SCALE_DEQUANTIZE_DESCRIPTOR: cudnnBackendDescriptorType_t = cudnnBackendDescriptorType_t( + 37, + ); +} +impl cudnnBackendDescriptorType_t { + pub const CUDNN_BACKEND_DEVICEPROP_DESCRIPTOR: cudnnBackendDescriptorType_t = cudnnBackendDescriptorType_t( + 38, + ); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnBackendDescriptorType_t(pub ::core::ffi::c_uint); +impl cudnnBackendNumericalNote_t { + pub const CUDNN_NUMERICAL_NOTE_TENSOR_CORE: cudnnBackendNumericalNote_t = cudnnBackendNumericalNote_t( + 0, + ); +} +impl cudnnBackendNumericalNote_t { + pub const CUDNN_NUMERICAL_NOTE_DOWN_CONVERT_INPUTS: cudnnBackendNumericalNote_t = cudnnBackendNumericalNote_t( + 1, + ); +} +impl cudnnBackendNumericalNote_t { + pub const CUDNN_NUMERICAL_NOTE_REDUCED_PRECISION_REDUCTION: cudnnBackendNumericalNote_t = cudnnBackendNumericalNote_t( + 2, + ); +} +impl cudnnBackendNumericalNote_t { + pub const CUDNN_NUMERICAL_NOTE_FFT: cudnnBackendNumericalNote_t = cudnnBackendNumericalNote_t( + 3, + ); +} +impl cudnnBackendNumericalNote_t { + pub const CUDNN_NUMERICAL_NOTE_NONDETERMINISTIC: cudnnBackendNumericalNote_t = cudnnBackendNumericalNote_t( + 4, + ); +} +impl 
cudnnBackendNumericalNote_t { + pub const CUDNN_NUMERICAL_NOTE_WINOGRAD: cudnnBackendNumericalNote_t = cudnnBackendNumericalNote_t( + 5, + ); +} +impl cudnnBackendNumericalNote_t { + pub const CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_4x4: cudnnBackendNumericalNote_t = cudnnBackendNumericalNote_t( + 6, + ); +} +impl cudnnBackendNumericalNote_t { + pub const CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_6x6: cudnnBackendNumericalNote_t = cudnnBackendNumericalNote_t( + 7, + ); +} +impl cudnnBackendNumericalNote_t { + pub const CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_13x13: cudnnBackendNumericalNote_t = cudnnBackendNumericalNote_t( + 8, + ); +} +impl cudnnBackendNumericalNote_t { + pub const CUDNN_NUMERICAL_NOTE_STRICT_NAN_PROP: cudnnBackendNumericalNote_t = cudnnBackendNumericalNote_t( + 9, + ); +} +impl cudnnBackendNumericalNote_t { + pub const CUDNN_NUMERICAL_NOTE_TYPE_COUNT: cudnnBackendNumericalNote_t = cudnnBackendNumericalNote_t( + 10, + ); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnBackendNumericalNote_t(pub ::core::ffi::c_uint); +impl cudnnBackendBehaviorNote_t { + pub const CUDNN_BEHAVIOR_NOTE_RUNTIME_COMPILATION: cudnnBackendBehaviorNote_t = cudnnBackendBehaviorNote_t( + 0, + ); +} +impl cudnnBackendBehaviorNote_t { + pub const CUDNN_BEHAVIOR_NOTE_REQUIRES_FILTER_INT8x32_REORDER: cudnnBackendBehaviorNote_t = cudnnBackendBehaviorNote_t( + 1, + ); +} +impl cudnnBackendBehaviorNote_t { + pub const CUDNN_BEHAVIOR_NOTE_REQUIRES_BIAS_INT8x32_REORDER: cudnnBackendBehaviorNote_t = cudnnBackendBehaviorNote_t( + 2, + ); +} +impl cudnnBackendBehaviorNote_t { + pub const CUDNN_BEHAVIOR_NOTE_SUPPORTS_CUDA_GRAPH_NATIVE_API: cudnnBackendBehaviorNote_t = cudnnBackendBehaviorNote_t( + 3, + ); +} +impl cudnnBackendBehaviorNote_t { + pub const CUDNN_BEHAVIOR_NOTE_TYPE_COUNT: cudnnBackendBehaviorNote_t = cudnnBackendBehaviorNote_t( + 4, + ); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnBackendBehaviorNote_t(pub ::core::ffi::c_uint); +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_SPLIT_K: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 0, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_SWIZZLE: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 1, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_TILE_SIZE: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 2, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_USE_TEX: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 3, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_EDGE: cudnnBackendKnobType_t = cudnnBackendKnobType_t(4); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_KBLOCK: cudnnBackendKnobType_t = cudnnBackendKnobType_t(5); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_LDGA: cudnnBackendKnobType_t = cudnnBackendKnobType_t(6); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_LDGB: cudnnBackendKnobType_t = cudnnBackendKnobType_t(7); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_CHUNK_K: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 8, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_SPLIT_H: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 9, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_WINO_TILE: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 10, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_MULTIPLY: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 11, + 
); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_SPLIT_K_BUF: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 12, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_TILEK: cudnnBackendKnobType_t = cudnnBackendKnobType_t(13); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_STAGES: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 14, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_REDUCTION_MODE: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 15, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_CTA_SPLIT_K_MODE: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 16, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_SPLIT_K_SLC: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 17, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_IDX_MODE: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 18, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_SLICED: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 19, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_SPLIT_RS: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 20, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_SINGLEBUFFER: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 21, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_LDGC: cudnnBackendKnobType_t = cudnnBackendKnobType_t(22); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_SPECFILT: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 23, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_KERNEL_CFG: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 24, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_WORKSPACE: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 25, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_TILE_CGA: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 26, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_TILE_CGA_M: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 27, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_TILE_CGA_N: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 28, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_BLOCK_SIZE: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 29, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_OCCUPANCY: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 30, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_ARRAY_SIZE_PER_THREAD: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 31, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_NUM_C_PER_BLOCK: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 32, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_SPLIT_COLS: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 33, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_TILE_ROWS: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 34, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_TILE_COLS: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 35, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_LOAD_SIZE: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 36, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_CTA_COUNT: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 37, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_STREAM_K: 
cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 38, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_SPLIT_P_SLC: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 39, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_TILE_M: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 40, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_TILE_N: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 41, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_WARP_SPEC_CFG: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 42, + ); +} +impl cudnnBackendKnobType_t { + pub const CUDNN_KNOB_TYPE_COUNTS: cudnnBackendKnobType_t = cudnnBackendKnobType_t( + 43, + ); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnBackendKnobType_t(pub ::core::ffi::c_uint); +pub use super::cudnn::cudnnBackendLayoutType_t; +pub use super::cudnn::cudnnBackendHeurMode_t; +impl cudnnBackendTensorReordering_t { + pub const CUDNN_TENSOR_REORDERING_NONE: cudnnBackendTensorReordering_t = cudnnBackendTensorReordering_t( + 0, + ); +} +impl cudnnBackendTensorReordering_t { + pub const CUDNN_TENSOR_REORDERING_INT8x32: cudnnBackendTensorReordering_t = cudnnBackendTensorReordering_t( + 1, + ); +} +impl cudnnBackendTensorReordering_t { + pub const CUDNN_TENSOR_REORDERING_F16x16: cudnnBackendTensorReordering_t = cudnnBackendTensorReordering_t( + 2, + ); +} +impl cudnnBackendTensorReordering_t { + pub const CUDNN_TENSOR_REORDERING_F8_128x4: cudnnBackendTensorReordering_t = cudnnBackendTensorReordering_t( + 3, + ); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnBackendTensorReordering_t(pub ::core::ffi::c_uint); +pub use super::cudnn::cudnnPaddingMode_t; +impl cudnnBackendNormMode_t { + pub const CUDNN_LAYER_NORM: cudnnBackendNormMode_t = cudnnBackendNormMode_t(0); +} +impl cudnnBackendNormMode_t { + pub const CUDNN_INSTANCE_NORM: cudnnBackendNormMode_t = cudnnBackendNormMode_t(1); +} +impl cudnnBackendNormMode_t { + pub const CUDNN_BATCH_NORM: cudnnBackendNormMode_t = cudnnBackendNormMode_t(2); +} +impl cudnnBackendNormMode_t { + pub const CUDNN_GROUP_NORM: cudnnBackendNormMode_t = cudnnBackendNormMode_t(3); +} +impl cudnnBackendNormMode_t { + pub const CUDNN_RMS_NORM: cudnnBackendNormMode_t = cudnnBackendNormMode_t(4); +} +impl cudnnBackendNormMode_t { + pub const CUDNN_ADA_LAYER_NORM: cudnnBackendNormMode_t = cudnnBackendNormMode_t(5); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudnnBackendNormMode_t(pub ::core::ffi::c_uint); +pub use super::cudnn::cudnnBackendNormFwdPhase_t; +pub use super::cudnn::cudnnTensorStruct; +pub type cudnnTensorDescriptor_t = *mut cudnnTensorStruct; +pub use super::cudnn::cudnnPoolingStruct; +pub type cudnnPoolingDescriptor_t = *mut cudnnPoolingStruct; +pub use super::cudnn::cudnnFilterStruct; +pub type cudnnFilterDescriptor_t = *mut cudnnFilterStruct; +pub use super::cudnn::cudnnLRNStruct; +pub type cudnnLRNDescriptor_t = *mut cudnnLRNStruct; +pub use super::cudnn::cudnnActivationStruct; +pub type cudnnActivationDescriptor_t = *mut cudnnActivationStruct; +pub use super::cudnn::cudnnSpatialTransformerStruct; +pub type cudnnSpatialTransformerDescriptor_t = *mut cudnnSpatialTransformerStruct; +pub use super::cudnn::cudnnOpTensorStruct; +pub type cudnnOpTensorDescriptor_t = *mut cudnnOpTensorStruct; +pub use super::cudnn::cudnnReduceTensorStruct; +pub type cudnnReduceTensorDescriptor_t = *mut 
cudnnReduceTensorStruct; +pub use super::cudnn::cudnnCTCLossStruct; +pub type cudnnCTCLossDescriptor_t = *mut cudnnCTCLossStruct; +pub use super::cudnn::cudnnTensorTransformStruct; +pub type cudnnTensorTransformDescriptor_t = *mut cudnnTensorTransformStruct; +pub use super::cudnn::cudnnDeterminism_t; +pub use super::cudnn::cudnnFoldingDirection_t; +pub use super::cudnn::cudnnOpTensorOp_t; +pub use super::cudnn::cudnnReduceTensorIndices_t; +pub use super::cudnn::cudnnIndicesType_t; +pub use super::cudnn::cudnnSoftmaxAlgorithm_t; +pub use super::cudnn::cudnnSoftmaxMode_t; +pub use super::cudnn::cudnnPoolingMode_t; +pub use super::cudnn::cudnnLRNMode_t; +pub use super::cudnn::cudnnDivNormMode_t; +pub use super::cudnn::cudnnBatchNormMode_t; +pub use super::cudnn::cudnnBatchNormOps_t; +pub use super::cudnn::cudnnNormMode_t; +pub use super::cudnn::cudnnNormAlgo_t; +pub use super::cudnn::cudnnNormOps_t; +pub use super::cudnn::cudnnSamplerType_t; +pub use super::cudnn::cudnnDropoutStruct; +pub type cudnnDropoutDescriptor_t = *mut cudnnDropoutStruct; +pub use super::cudnn::cudnnConvolutionFwdAlgo_t; +pub use super::cudnn::cudnnConvolutionBwdFilterAlgo_t; +pub use super::cudnn::cudnnConvolutionBwdDataAlgo_t; +pub use super::cudnn::cudnnCTCLossAlgo_t; +pub use super::cudnn::cudnnRNNAlgo_t; +pub use super::cudnn::cudnnForwardMode_t; +pub use super::cudnn::cudnnRNNMode_t; +pub use super::cudnn::cudnnRNNBiasMode_t; +pub use super::cudnn::cudnnDirectionMode_t; +pub use super::cudnn::cudnnRNNInputMode_t; +pub use super::cudnn::cudnnRNNClipMode_t; +pub use super::cudnn::cudnnRNNDataLayout_t; +pub use super::cudnn::cudnnRNNStruct; +pub type cudnnRNNDescriptor_t = *mut cudnnRNNStruct; +pub use super::cudnn::cudnnRNNDataStruct; +pub type cudnnRNNDataDescriptor_t = *mut cudnnRNNDataStruct; +pub use super::cudnn::cudnnSeqDataAxis_t; +pub use super::cudnn::cudnnSeqDataStruct; +pub type cudnnSeqDataDescriptor_t = *mut cudnnSeqDataStruct; +pub use super::cudnn::cudnnAttnStruct; +pub type cudnnAttnDescriptor_t = *mut cudnnAttnStruct; +pub use super::cudnn::cudnnMultiHeadAttnWeightKind_t; +pub use super::cudnn::cudnnWgradMode_t; +pub use super::cudnn::cudnnLossNormalizationMode_t; +pub use super::cudnn::cudnnConvolutionStruct; +pub type cudnnConvolutionDescriptor_t = *mut cudnnConvolutionStruct; +#[repr(C)] +#[derive(Debug, Copy, Clone, PartialEq)] +pub struct cudnnConvolutionFwdAlgoPerfStruct { + pub algo: cudnnConvolutionFwdAlgo_t, + pub status: cudnnStatus_t, + pub time: f32, + pub memory: usize, + pub determinism: cudnnDeterminism_t, + pub mathType: cudnnMathType_t, + pub reserved: [::core::ffi::c_int; 3usize], +} +pub type cudnnConvolutionFwdAlgoPerf_t = cudnnConvolutionFwdAlgoPerfStruct; +#[repr(C)] +#[derive(Debug, Copy, Clone, PartialEq)] +pub struct cudnnConvolutionBwdDataAlgoPerfStruct { + pub algo: cudnnConvolutionBwdDataAlgo_t, + pub status: cudnnStatus_t, + pub time: f32, + pub memory: usize, + pub determinism: cudnnDeterminism_t, + pub mathType: cudnnMathType_t, + pub reserved: [::core::ffi::c_int; 3usize], +} +pub type cudnnConvolutionBwdDataAlgoPerf_t = cudnnConvolutionBwdDataAlgoPerfStruct; +pub use super::cudnn::cudnnFusedOpsConstParamStruct; +pub type cudnnFusedOpsConstParamPack_t = *mut cudnnFusedOpsConstParamStruct; +pub use super::cudnn::cudnnFusedOpsVariantParamStruct; +pub type cudnnFusedOpsVariantParamPack_t = *mut cudnnFusedOpsVariantParamStruct; +pub use super::cudnn::cudnnFusedOpsPlanStruct; +pub type cudnnFusedOpsPlan_t = *mut cudnnFusedOpsPlanStruct; +pub use 
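// Illustrative sketch, not part of the generated file: the `pub use
// super::cudnn::...` re-exports above mean cudnn8 and cudnn9 share one set
// of opaque structs, so a descriptor handle is literally the same Rust type
// under either version module and needs no conversion:
fn same_descriptor_type(
    d: cudnnTensorDescriptor_t,
) -> *mut super::cudnn::cudnnTensorStruct {
    // cudnnTensorDescriptor_t is an alias for this exact pointer type.
    d
}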
super::cudnn::cudnnFusedOps_t; +pub use super::cudnn::cudnnFusedOpsConstParamLabel_t; +pub use super::cudnn::cudnnFusedOpsPointerPlaceHolder_t; +pub use super::cudnn::cudnnFusedOpsVariantParamLabel_t; +#[repr(C)] +#[derive(Debug, Copy, Clone, PartialEq)] +pub struct cudnnConvolutionBwdFilterAlgoPerfStruct { + pub algo: cudnnConvolutionBwdFilterAlgo_t, + pub status: cudnnStatus_t, + pub time: f32, + pub memory: usize, + pub determinism: cudnnDeterminism_t, + pub mathType: cudnnMathType_t, + pub reserved: [::core::ffi::c_int; 3usize], +} +pub type cudnnConvolutionBwdFilterAlgoPerf_t = cudnnConvolutionBwdFilterAlgoPerfStruct; diff --git a/cuda_types/src/cufft.rs b/cuda_types/src/cufft.rs new file mode 100644 index 0000000..a6d1833 --- /dev/null +++ b/cuda_types/src/cufft.rs @@ -0,0 +1,427 @@ +// Generated automatically by zluda_bindgen +// DO NOT EDIT MANUALLY +#![allow(warnings)] +pub type __half = u16; +pub type __nv_bfloat16 = u16; +pub use super::cuda::cuComplex; +pub use super::cuda::cuDoubleComplex; +pub use super::cuda::cudaDataType; +pub use super::cuda::cudaDataType_t; +pub type cudaStream_t = super::cuda::CUstream; +pub use super::cuda::libraryPropertyType; +pub type cudaGraphExecUpdateResultInfo_st = super::cuda::CUgraphExecUpdateResultInfo_st; +pub type cudaAsyncNotificationType = super::cuda::CUasyncNotificationType_enum; +pub type cudaGraph_t = super::cuda::CUgraph; +pub const CUFFT_VER_MAJOR: u32 = 11; +pub const CUFFT_VER_MINOR: u32 = 3; +pub const CUFFT_VER_PATCH: u32 = 3; +pub const CUFFT_VER_BUILD: u32 = 83; +pub const CUFFT_VERSION: u32 = 11303; +pub const CUFFT_FORWARD: i32 = -1; +pub const CUFFT_INVERSE: u32 = 1; +impl libFormat_t { + pub const LIB_FORMAT_CUFFT: libFormat_t = libFormat_t(0); +} +impl libFormat_t { + pub const LIB_FORMAT_UNDEFINED: libFormat_t = libFormat_t(1); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct libFormat_t(pub ::core::ffi::c_uint); +pub use self::libFormat_t as libFormat; +#[repr(C)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudaXtDesc_t { + pub version: ::core::ffi::c_int, + pub nGPUs: ::core::ffi::c_int, + pub GPUs: [::core::ffi::c_int; 64usize], + pub data: [*mut ::core::ffi::c_void; 64usize], + pub size: [usize; 64usize], + pub cudaXtState: *mut ::core::ffi::c_void, +} +pub type cudaXtDesc = cudaXtDesc_t; +#[repr(C)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudaLibXtDesc_t { + pub version: ::core::ffi::c_int, + pub descriptor: *mut cudaXtDesc, + pub library: libFormat, + pub subFormat: ::core::ffi::c_int, + pub libDescriptor: *mut ::core::ffi::c_void, +} +pub type cudaLibXtDesc = cudaLibXtDesc_t; +impl cufftResult_t { + pub const CUFFT_SUCCESS: cufftResult_t = cufftResult_t(0); +} +impl cufftResult_t { + pub const CUFFT_INVALID_PLAN: cufftResult_t = cufftResult_t(1); +} +impl cufftResult_t { + pub const CUFFT_ALLOC_FAILED: cufftResult_t = cufftResult_t(2); +} +impl cufftResult_t { + pub const CUFFT_INVALID_TYPE: cufftResult_t = cufftResult_t(3); +} +impl cufftResult_t { + pub const CUFFT_INVALID_VALUE: cufftResult_t = cufftResult_t(4); +} +impl cufftResult_t { + pub const CUFFT_INTERNAL_ERROR: cufftResult_t = cufftResult_t(5); +} +impl cufftResult_t { + pub const CUFFT_EXEC_FAILED: cufftResult_t = cufftResult_t(6); +} +impl cufftResult_t { + pub const CUFFT_SETUP_FAILED: cufftResult_t = cufftResult_t(7); +} +impl cufftResult_t { + pub const CUFFT_INVALID_SIZE: cufftResult_t = cufftResult_t(8); +} +impl cufftResult_t { + pub const CUFFT_UNALIGNED_DATA: 
cufftResult_t = cufftResult_t(9); +} +impl cufftResult_t { + pub const CUFFT_INCOMPLETE_PARAMETER_LIST: cufftResult_t = cufftResult_t(10); +} +impl cufftResult_t { + pub const CUFFT_INVALID_DEVICE: cufftResult_t = cufftResult_t(11); +} +impl cufftResult_t { + pub const CUFFT_PARSE_ERROR: cufftResult_t = cufftResult_t(12); +} +impl cufftResult_t { + pub const CUFFT_NO_WORKSPACE: cufftResult_t = cufftResult_t(13); +} +impl cufftResult_t { + pub const CUFFT_NOT_IMPLEMENTED: cufftResult_t = cufftResult_t(14); +} +impl cufftResult_t { + pub const CUFFT_LICENSE_ERROR: cufftResult_t = cufftResult_t(15); +} +impl cufftResult_t { + pub const CUFFT_NOT_SUPPORTED: cufftResult_t = cufftResult_t(16); +} +#[repr(transparent)] +#[must_use] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cufftResult_t(pub ::core::ffi::c_uint); +pub use self::cufftResult_t as cufftResult; +pub type cufftReal = f32; +pub type cufftDoubleReal = f64; +pub type cufftComplex = super::cuda::cuComplex; +pub type cufftDoubleComplex = super::cuda::cuDoubleComplex; +impl cufftType_t { + pub const CUFFT_R2C: cufftType_t = cufftType_t(42); +} +impl cufftType_t { + pub const CUFFT_C2R: cufftType_t = cufftType_t(44); +} +impl cufftType_t { + pub const CUFFT_C2C: cufftType_t = cufftType_t(41); +} +impl cufftType_t { + pub const CUFFT_D2Z: cufftType_t = cufftType_t(106); +} +impl cufftType_t { + pub const CUFFT_Z2D: cufftType_t = cufftType_t(108); +} +impl cufftType_t { + pub const CUFFT_Z2Z: cufftType_t = cufftType_t(105); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cufftType_t(pub ::core::ffi::c_uint); +pub use self::cufftType_t as cufftType; +impl cufftCompatibility_t { + pub const CUFFT_COMPATIBILITY_FFTW_PADDING: cufftCompatibility_t = cufftCompatibility_t( + 1, + ); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cufftCompatibility_t(pub ::core::ffi::c_uint); +pub use self::cufftCompatibility_t as cufftCompatibility; +pub type cufftHandle = ::core::ffi::c_int; +impl cufftProperty_t { + pub const NVFFT_PLAN_PROPERTY_INT64_PATIENT_JIT: cufftProperty_t = cufftProperty_t( + 1, + ); +} +impl cufftProperty_t { + pub const NVFFT_PLAN_PROPERTY_INT64_MAX_NUM_HOST_THREADS: cufftProperty_t = cufftProperty_t( + 2, + ); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cufftProperty_t(pub ::core::ffi::c_uint); +pub use self::cufftProperty_t as cufftProperty; +impl cufftXtSubFormat_t { + pub const CUFFT_XT_FORMAT_INPUT: cufftXtSubFormat_t = cufftXtSubFormat_t(0); +} +impl cufftXtSubFormat_t { + pub const CUFFT_XT_FORMAT_OUTPUT: cufftXtSubFormat_t = cufftXtSubFormat_t(1); +} +impl cufftXtSubFormat_t { + pub const CUFFT_XT_FORMAT_INPLACE: cufftXtSubFormat_t = cufftXtSubFormat_t(2); +} +impl cufftXtSubFormat_t { + pub const CUFFT_XT_FORMAT_INPLACE_SHUFFLED: cufftXtSubFormat_t = cufftXtSubFormat_t( + 3, + ); +} +impl cufftXtSubFormat_t { + pub const CUFFT_XT_FORMAT_1D_INPUT_SHUFFLED: cufftXtSubFormat_t = cufftXtSubFormat_t( + 4, + ); +} +impl cufftXtSubFormat_t { + pub const CUFFT_XT_FORMAT_DISTRIBUTED_INPUT: cufftXtSubFormat_t = cufftXtSubFormat_t( + 5, + ); +} +impl cufftXtSubFormat_t { + pub const CUFFT_XT_FORMAT_DISTRIBUTED_OUTPUT: cufftXtSubFormat_t = cufftXtSubFormat_t( + 6, + ); +} +impl cufftXtSubFormat_t { + pub const CUFFT_FORMAT_UNDEFINED: cufftXtSubFormat_t = cufftXtSubFormat_t(7); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cufftXtSubFormat_t(pub 
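// Illustrative sketch, not part of the generated file: the status type above
// is a #[repr(transparent)] newtype over c_uint rather than a Rust `enum`,
// so unknown values coming back over FFI stay representable instead of being
// undefined behavior. Because it derives PartialEq/Eq, its associated
// constants still work as match patterns:
fn cufft_check(status: cufftResult_t) -> Result<(), cufftResult_t> {
    match status {
        cufftResult_t::CUFFT_SUCCESS => Ok(()),
        err => Err(err), // #[must_use] ensures callers cannot silently drop this
    }
}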
::core::ffi::c_uint); +pub use self::cufftXtSubFormat_t as cufftXtSubFormat; +impl cufftXtCopyType_t { + pub const CUFFT_COPY_HOST_TO_DEVICE: cufftXtCopyType_t = cufftXtCopyType_t(0); +} +impl cufftXtCopyType_t { + pub const CUFFT_COPY_DEVICE_TO_HOST: cufftXtCopyType_t = cufftXtCopyType_t(1); +} +impl cufftXtCopyType_t { + pub const CUFFT_COPY_DEVICE_TO_DEVICE: cufftXtCopyType_t = cufftXtCopyType_t(2); +} +impl cufftXtCopyType_t { + pub const CUFFT_COPY_UNDEFINED: cufftXtCopyType_t = cufftXtCopyType_t(3); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cufftXtCopyType_t(pub ::core::ffi::c_uint); +pub use self::cufftXtCopyType_t as cufftXtCopyType; +impl cufftXtQueryType_t { + pub const CUFFT_QUERY_1D_FACTORS: cufftXtQueryType_t = cufftXtQueryType_t(0); +} +impl cufftXtQueryType_t { + pub const CUFFT_QUERY_UNDEFINED: cufftXtQueryType_t = cufftXtQueryType_t(1); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cufftXtQueryType_t(pub ::core::ffi::c_uint); +pub use self::cufftXtQueryType_t as cufftXtQueryType; +#[repr(C)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cufftXt1dFactors_t { + pub size: ::core::ffi::c_longlong, + pub stringCount: ::core::ffi::c_longlong, + pub stringLength: ::core::ffi::c_longlong, + pub substringLength: ::core::ffi::c_longlong, + pub factor1: ::core::ffi::c_longlong, + pub factor2: ::core::ffi::c_longlong, + pub stringMask: ::core::ffi::c_longlong, + pub substringMask: ::core::ffi::c_longlong, + pub factor1Mask: ::core::ffi::c_longlong, + pub factor2Mask: ::core::ffi::c_longlong, + pub stringShift: ::core::ffi::c_int, + pub substringShift: ::core::ffi::c_int, + pub factor1Shift: ::core::ffi::c_int, + pub factor2Shift: ::core::ffi::c_int, +} +pub type cufftXt1dFactors = cufftXt1dFactors_t; +impl cufftXtWorkAreaPolicy_t { + pub const CUFFT_WORKAREA_MINIMAL: cufftXtWorkAreaPolicy_t = cufftXtWorkAreaPolicy_t( + 0, + ); +} +impl cufftXtWorkAreaPolicy_t { + pub const CUFFT_WORKAREA_USER: cufftXtWorkAreaPolicy_t = cufftXtWorkAreaPolicy_t(1); +} +impl cufftXtWorkAreaPolicy_t { + pub const CUFFT_WORKAREA_PERFORMANCE: cufftXtWorkAreaPolicy_t = cufftXtWorkAreaPolicy_t( + 2, + ); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cufftXtWorkAreaPolicy_t(pub ::core::ffi::c_uint); +pub use self::cufftXtWorkAreaPolicy_t as cufftXtWorkAreaPolicy; +impl cufftXtCallbackType_t { + pub const CUFFT_CB_LD_COMPLEX: cufftXtCallbackType_t = cufftXtCallbackType_t(0); +} +impl cufftXtCallbackType_t { + pub const CUFFT_CB_LD_COMPLEX_DOUBLE: cufftXtCallbackType_t = cufftXtCallbackType_t( + 1, + ); +} +impl cufftXtCallbackType_t { + pub const CUFFT_CB_LD_REAL: cufftXtCallbackType_t = cufftXtCallbackType_t(2); +} +impl cufftXtCallbackType_t { + pub const CUFFT_CB_LD_REAL_DOUBLE: cufftXtCallbackType_t = cufftXtCallbackType_t(3); +} +impl cufftXtCallbackType_t { + pub const CUFFT_CB_ST_COMPLEX: cufftXtCallbackType_t = cufftXtCallbackType_t(4); +} +impl cufftXtCallbackType_t { + pub const CUFFT_CB_ST_COMPLEX_DOUBLE: cufftXtCallbackType_t = cufftXtCallbackType_t( + 5, + ); +} +impl cufftXtCallbackType_t { + pub const CUFFT_CB_ST_REAL: cufftXtCallbackType_t = cufftXtCallbackType_t(6); +} +impl cufftXtCallbackType_t { + pub const CUFFT_CB_ST_REAL_DOUBLE: cufftXtCallbackType_t = cufftXtCallbackType_t(7); +} +impl cufftXtCallbackType_t { + pub const CUFFT_CB_UNDEFINED: cufftXtCallbackType_t = cufftXtCallbackType_t(8); +} +#[repr(transparent)] +#[derive(Debug, Copy, 
Clone, Hash, PartialEq, Eq)] +pub struct cufftXtCallbackType_t(pub ::core::ffi::c_uint); +pub use self::cufftXtCallbackType_t as cufftXtCallbackType; +pub type cufftCallbackLoadC = ::core::option::Option< + unsafe extern "C" fn( + dataIn: *mut ::core::ffi::c_void, + offset: usize, + callerInfo: *mut ::core::ffi::c_void, + sharedPointer: *mut ::core::ffi::c_void, + ) -> cufftComplex, +>; +pub type cufftCallbackLoadZ = ::core::option::Option< + unsafe extern "C" fn( + dataIn: *mut ::core::ffi::c_void, + offset: usize, + callerInfo: *mut ::core::ffi::c_void, + sharedPointer: *mut ::core::ffi::c_void, + ) -> cufftDoubleComplex, +>; +pub type cufftCallbackLoadR = ::core::option::Option< + unsafe extern "C" fn( + dataIn: *mut ::core::ffi::c_void, + offset: usize, + callerInfo: *mut ::core::ffi::c_void, + sharedPointer: *mut ::core::ffi::c_void, + ) -> cufftReal, +>; +pub type cufftCallbackLoadD = ::core::option::Option< + unsafe extern "C" fn( + dataIn: *mut ::core::ffi::c_void, + offset: usize, + callerInfo: *mut ::core::ffi::c_void, + sharedPointer: *mut ::core::ffi::c_void, + ) -> cufftDoubleReal, +>; +pub type cufftCallbackStoreC = ::core::option::Option< + unsafe extern "C" fn( + dataOut: *mut ::core::ffi::c_void, + offset: usize, + element: cufftComplex, + callerInfo: *mut ::core::ffi::c_void, + sharedPointer: *mut ::core::ffi::c_void, + ), +>; +pub type cufftCallbackStoreZ = ::core::option::Option< + unsafe extern "C" fn( + dataOut: *mut ::core::ffi::c_void, + offset: usize, + element: cufftDoubleComplex, + callerInfo: *mut ::core::ffi::c_void, + sharedPointer: *mut ::core::ffi::c_void, + ), +>; +pub type cufftCallbackStoreR = ::core::option::Option< + unsafe extern "C" fn( + dataOut: *mut ::core::ffi::c_void, + offset: usize, + element: cufftReal, + callerInfo: *mut ::core::ffi::c_void, + sharedPointer: *mut ::core::ffi::c_void, + ), +>; +pub type cufftCallbackStoreD = ::core::option::Option< + unsafe extern "C" fn( + dataOut: *mut ::core::ffi::c_void, + offset: usize, + element: cufftDoubleReal, + callerInfo: *mut ::core::ffi::c_void, + sharedPointer: *mut ::core::ffi::c_void, + ), +>; +pub type cufftJITCallbackLoadC = ::core::option::Option< + unsafe extern "C" fn( + dataIn: *mut ::core::ffi::c_void, + offset: ::core::ffi::c_ulonglong, + callerInfo: *mut ::core::ffi::c_void, + sharedPointer: *mut ::core::ffi::c_void, + ) -> cufftComplex, +>; +pub type cufftJITCallbackLoadZ = ::core::option::Option< + unsafe extern "C" fn( + dataIn: *mut ::core::ffi::c_void, + offset: ::core::ffi::c_ulonglong, + callerInfo: *mut ::core::ffi::c_void, + sharedPointer: *mut ::core::ffi::c_void, + ) -> cufftDoubleComplex, +>; +pub type cufftJITCallbackLoadR = ::core::option::Option< + unsafe extern "C" fn( + dataIn: *mut ::core::ffi::c_void, + offset: ::core::ffi::c_ulonglong, + callerInfo: *mut ::core::ffi::c_void, + sharedPointer: *mut ::core::ffi::c_void, + ) -> cufftReal, +>; +pub type cufftJITCallbackLoadD = ::core::option::Option< + unsafe extern "C" fn( + dataIn: *mut ::core::ffi::c_void, + offset: ::core::ffi::c_ulonglong, + callerInfo: *mut ::core::ffi::c_void, + sharedPointer: *mut ::core::ffi::c_void, + ) -> cufftDoubleReal, +>; +pub type cufftJITCallbackStoreC = ::core::option::Option< + unsafe extern "C" fn( + dataOut: *mut ::core::ffi::c_void, + offset: ::core::ffi::c_ulonglong, + element: cufftComplex, + callerInfo: *mut ::core::ffi::c_void, + sharedPointer: *mut ::core::ffi::c_void, + ), +>; +pub type cufftJITCallbackStoreZ = ::core::option::Option< + unsafe extern "C" fn( + dataOut: *mut 
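// Illustrative sketch, not part of the generated file: the callback aliases
// above are Option-wrapped `extern "C"` function pointers, so a function with
// the matching signature coerces directly. (Real cuFFT load/store callbacks
// must be device code; this host-side stub only demonstrates that the
// signature lines up with cufftCallbackLoadR.)
unsafe extern "C" fn load_zero(
    _data_in: *mut ::core::ffi::c_void,
    _offset: usize,
    _caller_info: *mut ::core::ffi::c_void,
    _shared_pointer: *mut ::core::ffi::c_void,
) -> cufftReal {
    0.0
}
const LOAD_CB: cufftCallbackLoadR = Some(load_zero);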
::core::ffi::c_void, + offset: ::core::ffi::c_ulonglong, + element: cufftDoubleComplex, + callerInfo: *mut ::core::ffi::c_void, + sharedPointer: *mut ::core::ffi::c_void, + ), +>; +pub type cufftJITCallbackStoreR = ::core::option::Option< + unsafe extern "C" fn( + dataOut: *mut ::core::ffi::c_void, + offset: ::core::ffi::c_ulonglong, + element: cufftReal, + callerInfo: *mut ::core::ffi::c_void, + sharedPointer: *mut ::core::ffi::c_void, + ), +>; +pub type cufftJITCallbackStoreD = ::core::option::Option< + unsafe extern "C" fn( + dataOut: *mut ::core::ffi::c_void, + offset: ::core::ffi::c_ulonglong, + element: cufftDoubleReal, + callerInfo: *mut ::core::ffi::c_void, + sharedPointer: *mut ::core::ffi::c_void, + ), +>; diff --git a/cuda_types/src/cusparse.rs b/cuda_types/src/cusparse.rs new file mode 100644 index 0000000..b29c207 --- /dev/null +++ b/cuda_types/src/cusparse.rs @@ -0,0 +1,550 @@ +// Generated automatically by zluda_bindgen +// DO NOT EDIT MANUALLY +#![allow(warnings)] +pub type __half = u16; +pub type __nv_bfloat16 = u16; +pub use super::cuda::cuComplex; +pub use super::cuda::cuDoubleComplex; +pub use super::cuda::cudaDataType; +pub use super::cuda::cudaDataType_t; +pub type cudaStream_t = super::cuda::CUstream; +pub use super::cuda::libraryPropertyType; +pub type cudaGraphExecUpdateResultInfo_st = super::cuda::CUgraphExecUpdateResultInfo_st; +pub type cudaAsyncNotificationType = super::cuda::CUasyncNotificationType_enum; +pub type cudaGraph_t = super::cuda::CUgraph; +pub const CUSPARSE_VER_MAJOR: u32 = 12; +pub const CUSPARSE_VER_MINOR: u32 = 5; +pub const CUSPARSE_VER_PATCH: u32 = 8; +pub const CUSPARSE_VER_BUILD: u32 = 93; +pub const CUSPARSE_VERSION: u32 = 12508; +/// Result information returned by cudaGraphExecUpdate +pub type cudaGraphExecUpdateResultInfo = cudaGraphExecUpdateResultInfo_st; +/// Information describing an async notification event +#[repr(C)] +pub struct cudaAsyncNotificationInfo { + pub type_: cudaAsyncNotificationType, + pub info: cudaAsyncNotificationInfo__bindgen_ty_1, +} +#[repr(C)] +#[derive(Copy, Clone)] +pub union cudaAsyncNotificationInfo__bindgen_ty_1 { + pub overBudget: cudaAsyncNotificationInfo__bindgen_ty_1__bindgen_ty_1, +} +#[repr(C)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cudaAsyncNotificationInfo__bindgen_ty_1__bindgen_ty_1 { + pub bytesOverBudget: ::core::ffi::c_ulonglong, +} +/// Information describing an async notification event +pub type cudaAsyncNotificationInfo_t = cudaAsyncNotificationInfo; +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct cusparseContext { + _unused: [u8; 0], +} +pub type cusparseHandle_t = *mut cusparseContext; +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct cusparseMatDescr { + _unused: [u8; 0], +} +pub type cusparseMatDescr_t = *mut cusparseMatDescr; +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct bsrsv2Info { + _unused: [u8; 0], +} +pub type bsrsv2Info_t = *mut bsrsv2Info; +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct bsrsm2Info { + _unused: [u8; 0], +} +pub type bsrsm2Info_t = *mut bsrsm2Info; +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct csric02Info { + _unused: [u8; 0], +} +pub type csric02Info_t = *mut csric02Info; +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct bsric02Info { + _unused: [u8; 0], +} +pub type bsric02Info_t = *mut bsric02Info; +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct csrilu02Info { + _unused: [u8; 0], +} +pub type csrilu02Info_t = *mut csrilu02Info; +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct 
bsrilu02Info { + _unused: [u8; 0], +} +pub type bsrilu02Info_t = *mut bsrilu02Info; +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct csru2csrInfo { + _unused: [u8; 0], +} +pub type csru2csrInfo_t = *mut csru2csrInfo; +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct cusparseColorInfo { + _unused: [u8; 0], +} +pub type cusparseColorInfo_t = *mut cusparseColorInfo; +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct pruneInfo { + _unused: [u8; 0], +} +pub type pruneInfo_t = *mut pruneInfo; +impl cusparseStatus_t { + pub const CUSPARSE_STATUS_SUCCESS: cusparseStatus_t = cusparseStatus_t(0); +} +impl cusparseStatus_t { + pub const CUSPARSE_STATUS_NOT_INITIALIZED: cusparseStatus_t = cusparseStatus_t(1); +} +impl cusparseStatus_t { + pub const CUSPARSE_STATUS_ALLOC_FAILED: cusparseStatus_t = cusparseStatus_t(2); +} +impl cusparseStatus_t { + pub const CUSPARSE_STATUS_INVALID_VALUE: cusparseStatus_t = cusparseStatus_t(3); +} +impl cusparseStatus_t { + pub const CUSPARSE_STATUS_ARCH_MISMATCH: cusparseStatus_t = cusparseStatus_t(4); +} +impl cusparseStatus_t { + pub const CUSPARSE_STATUS_MAPPING_ERROR: cusparseStatus_t = cusparseStatus_t(5); +} +impl cusparseStatus_t { + pub const CUSPARSE_STATUS_EXECUTION_FAILED: cusparseStatus_t = cusparseStatus_t(6); +} +impl cusparseStatus_t { + pub const CUSPARSE_STATUS_INTERNAL_ERROR: cusparseStatus_t = cusparseStatus_t(7); +} +impl cusparseStatus_t { + pub const CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: cusparseStatus_t = cusparseStatus_t( + 8, + ); +} +impl cusparseStatus_t { + pub const CUSPARSE_STATUS_ZERO_PIVOT: cusparseStatus_t = cusparseStatus_t(9); +} +impl cusparseStatus_t { + pub const CUSPARSE_STATUS_NOT_SUPPORTED: cusparseStatus_t = cusparseStatus_t(10); +} +impl cusparseStatus_t { + pub const CUSPARSE_STATUS_INSUFFICIENT_RESOURCES: cusparseStatus_t = cusparseStatus_t( + 11, + ); +} +#[repr(transparent)] +#[must_use] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cusparseStatus_t(pub ::core::ffi::c_uint); +impl cusparsePointerMode_t { + pub const CUSPARSE_POINTER_MODE_HOST: cusparsePointerMode_t = cusparsePointerMode_t( + 0, + ); +} +impl cusparsePointerMode_t { + pub const CUSPARSE_POINTER_MODE_DEVICE: cusparsePointerMode_t = cusparsePointerMode_t( + 1, + ); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cusparsePointerMode_t(pub ::core::ffi::c_uint); +impl cusparseAction_t { + pub const CUSPARSE_ACTION_SYMBOLIC: cusparseAction_t = cusparseAction_t(0); +} +impl cusparseAction_t { + pub const CUSPARSE_ACTION_NUMERIC: cusparseAction_t = cusparseAction_t(1); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cusparseAction_t(pub ::core::ffi::c_uint); +impl cusparseMatrixType_t { + pub const CUSPARSE_MATRIX_TYPE_GENERAL: cusparseMatrixType_t = cusparseMatrixType_t( + 0, + ); +} +impl cusparseMatrixType_t { + pub const CUSPARSE_MATRIX_TYPE_SYMMETRIC: cusparseMatrixType_t = cusparseMatrixType_t( + 1, + ); +} +impl cusparseMatrixType_t { + pub const CUSPARSE_MATRIX_TYPE_HERMITIAN: cusparseMatrixType_t = cusparseMatrixType_t( + 2, + ); +} +impl cusparseMatrixType_t { + pub const CUSPARSE_MATRIX_TYPE_TRIANGULAR: cusparseMatrixType_t = cusparseMatrixType_t( + 3, + ); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cusparseMatrixType_t(pub ::core::ffi::c_uint); +impl cusparseFillMode_t { + pub const CUSPARSE_FILL_MODE_LOWER: cusparseFillMode_t = cusparseFillMode_t(0); +} +impl cusparseFillMode_t { + pub 
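// Illustrative sketch, not part of the generated file: every cuSPARSE object
// above follows the same opaque-struct-plus-pointer-alias pattern. Assuming
// the corresponding function binding (cusparseCreate, declared in cuda_base
// rather than in this file) is in scope, handle creation looks like this;
// since cusparseStatus_t is #[must_use], the status cannot be ignored:
extern "C" {
    fn cusparseCreate(handle: *mut cusparseHandle_t) -> cusparseStatus_t;
}
fn create_handle() -> Result<cusparseHandle_t, cusparseStatus_t> {
    let mut handle: cusparseHandle_t = core::ptr::null_mut();
    match unsafe { cusparseCreate(&mut handle) } {
        cusparseStatus_t::CUSPARSE_STATUS_SUCCESS => Ok(handle),
        err => Err(err),
    }
}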
const CUSPARSE_FILL_MODE_UPPER: cusparseFillMode_t = cusparseFillMode_t(1); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cusparseFillMode_t(pub ::core::ffi::c_uint); +impl cusparseDiagType_t { + pub const CUSPARSE_DIAG_TYPE_NON_UNIT: cusparseDiagType_t = cusparseDiagType_t(0); +} +impl cusparseDiagType_t { + pub const CUSPARSE_DIAG_TYPE_UNIT: cusparseDiagType_t = cusparseDiagType_t(1); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cusparseDiagType_t(pub ::core::ffi::c_uint); +impl cusparseIndexBase_t { + pub const CUSPARSE_INDEX_BASE_ZERO: cusparseIndexBase_t = cusparseIndexBase_t(0); +} +impl cusparseIndexBase_t { + pub const CUSPARSE_INDEX_BASE_ONE: cusparseIndexBase_t = cusparseIndexBase_t(1); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cusparseIndexBase_t(pub ::core::ffi::c_uint); +impl cusparseOperation_t { + pub const CUSPARSE_OPERATION_NON_TRANSPOSE: cusparseOperation_t = cusparseOperation_t( + 0, + ); +} +impl cusparseOperation_t { + pub const CUSPARSE_OPERATION_TRANSPOSE: cusparseOperation_t = cusparseOperation_t(1); +} +impl cusparseOperation_t { + pub const CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE: cusparseOperation_t = cusparseOperation_t( + 2, + ); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cusparseOperation_t(pub ::core::ffi::c_uint); +impl cusparseDirection_t { + pub const CUSPARSE_DIRECTION_ROW: cusparseDirection_t = cusparseDirection_t(0); +} +impl cusparseDirection_t { + pub const CUSPARSE_DIRECTION_COLUMN: cusparseDirection_t = cusparseDirection_t(1); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cusparseDirection_t(pub ::core::ffi::c_uint); +impl cusparseSolvePolicy_t { + pub const CUSPARSE_SOLVE_POLICY_NO_LEVEL: cusparseSolvePolicy_t = cusparseSolvePolicy_t( + 0, + ); +} +impl cusparseSolvePolicy_t { + pub const CUSPARSE_SOLVE_POLICY_USE_LEVEL: cusparseSolvePolicy_t = cusparseSolvePolicy_t( + 1, + ); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cusparseSolvePolicy_t(pub ::core::ffi::c_uint); +impl cusparseColorAlg_t { + pub const CUSPARSE_COLOR_ALG0: cusparseColorAlg_t = cusparseColorAlg_t(0); +} +impl cusparseColorAlg_t { + pub const CUSPARSE_COLOR_ALG1: cusparseColorAlg_t = cusparseColorAlg_t(1); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cusparseColorAlg_t(pub ::core::ffi::c_uint); +pub type cusparseLoggerCallback_t = ::core::option::Option< + unsafe extern "C" fn( + logLevel: ::core::ffi::c_int, + functionName: *const ::core::ffi::c_char, + message: *const ::core::ffi::c_char, + ), +>; +impl cusparseCsr2CscAlg_t { + pub const CUSPARSE_CSR2CSC_ALG_DEFAULT: cusparseCsr2CscAlg_t = cusparseCsr2CscAlg_t( + 1, + ); +} +impl cusparseCsr2CscAlg_t { + pub const CUSPARSE_CSR2CSC_ALG1: cusparseCsr2CscAlg_t = cusparseCsr2CscAlg_t(1); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cusparseCsr2CscAlg_t(pub ::core::ffi::c_uint); +impl cusparseFormat_t { + ///< Compressed Sparse Row (CSR) + pub const CUSPARSE_FORMAT_CSR: cusparseFormat_t = cusparseFormat_t(1); +} +impl cusparseFormat_t { + ///< Compressed Sparse Column (CSC) + pub const CUSPARSE_FORMAT_CSC: cusparseFormat_t = cusparseFormat_t(2); +} +impl cusparseFormat_t { + ///< Coordinate (COO) - Structure of Arrays + pub const CUSPARSE_FORMAT_COO: cusparseFormat_t = 
cusparseFormat_t(3); +} +impl cusparseFormat_t { + ///< Blocked ELL + pub const CUSPARSE_FORMAT_BLOCKED_ELL: cusparseFormat_t = cusparseFormat_t(5); +} +impl cusparseFormat_t { + ///< Blocked Compressed Sparse Row (BSR) + pub const CUSPARSE_FORMAT_BSR: cusparseFormat_t = cusparseFormat_t(6); +} +impl cusparseFormat_t { + ///< Sliced ELL + pub const CUSPARSE_FORMAT_SLICED_ELLPACK: cusparseFormat_t = cusparseFormat_t(7); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cusparseFormat_t(pub ::core::ffi::c_uint); +impl cusparseOrder_t { + ///< Column-Major Order - Matrix memory layout + pub const CUSPARSE_ORDER_COL: cusparseOrder_t = cusparseOrder_t(1); +} +impl cusparseOrder_t { + ///< Row-Major Order - Matrix memory layout + pub const CUSPARSE_ORDER_ROW: cusparseOrder_t = cusparseOrder_t(2); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cusparseOrder_t(pub ::core::ffi::c_uint); +impl cusparseIndexType_t { + /**< 16-bit unsigned integer for matrix/vector +< indices*/ + pub const CUSPARSE_INDEX_16U: cusparseIndexType_t = cusparseIndexType_t(1); +} +impl cusparseIndexType_t { + ///< 32-bit signed integer for matrix/vector indices + pub const CUSPARSE_INDEX_32I: cusparseIndexType_t = cusparseIndexType_t(2); +} +impl cusparseIndexType_t { + ///< 64-bit signed integer for matrix/vector indices + pub const CUSPARSE_INDEX_64I: cusparseIndexType_t = cusparseIndexType_t(3); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cusparseIndexType_t(pub ::core::ffi::c_uint); +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct cusparseSpVecDescr { + _unused: [u8; 0], +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct cusparseDnVecDescr { + _unused: [u8; 0], +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct cusparseSpMatDescr { + _unused: [u8; 0], +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct cusparseDnMatDescr { + _unused: [u8; 0], +} +pub type cusparseSpVecDescr_t = *mut cusparseSpVecDescr; +pub type cusparseDnVecDescr_t = *mut cusparseDnVecDescr; +pub type cusparseSpMatDescr_t = *mut cusparseSpMatDescr; +pub type cusparseDnMatDescr_t = *mut cusparseDnMatDescr; +pub type cusparseConstSpVecDescr_t = *const cusparseSpVecDescr; +pub type cusparseConstDnVecDescr_t = *const cusparseDnVecDescr; +pub type cusparseConstSpMatDescr_t = *const cusparseSpMatDescr; +pub type cusparseConstDnMatDescr_t = *const cusparseDnMatDescr; +impl cusparseSpMatAttribute_t { + pub const CUSPARSE_SPMAT_FILL_MODE: cusparseSpMatAttribute_t = cusparseSpMatAttribute_t( + 0, + ); +} +impl cusparseSpMatAttribute_t { + pub const CUSPARSE_SPMAT_DIAG_TYPE: cusparseSpMatAttribute_t = cusparseSpMatAttribute_t( + 1, + ); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cusparseSpMatAttribute_t(pub ::core::ffi::c_uint); +impl cusparseSparseToDenseAlg_t { + pub const CUSPARSE_SPARSETODENSE_ALG_DEFAULT: cusparseSparseToDenseAlg_t = cusparseSparseToDenseAlg_t( + 0, + ); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cusparseSparseToDenseAlg_t(pub ::core::ffi::c_uint); +impl cusparseDenseToSparseAlg_t { + pub const CUSPARSE_DENSETOSPARSE_ALG_DEFAULT: cusparseDenseToSparseAlg_t = cusparseDenseToSparseAlg_t( + 0, + ); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cusparseDenseToSparseAlg_t(pub ::core::ffi::c_uint); +impl cusparseSpMVAlg_t { + pub const CUSPARSE_SPMV_ALG_DEFAULT: 
cusparseSpMVAlg_t = cusparseSpMVAlg_t(0); +} +impl cusparseSpMVAlg_t { + pub const CUSPARSE_SPMV_CSR_ALG1: cusparseSpMVAlg_t = cusparseSpMVAlg_t(2); +} +impl cusparseSpMVAlg_t { + pub const CUSPARSE_SPMV_CSR_ALG2: cusparseSpMVAlg_t = cusparseSpMVAlg_t(3); +} +impl cusparseSpMVAlg_t { + pub const CUSPARSE_SPMV_COO_ALG1: cusparseSpMVAlg_t = cusparseSpMVAlg_t(1); +} +impl cusparseSpMVAlg_t { + pub const CUSPARSE_SPMV_COO_ALG2: cusparseSpMVAlg_t = cusparseSpMVAlg_t(4); +} +impl cusparseSpMVAlg_t { + pub const CUSPARSE_SPMV_SELL_ALG1: cusparseSpMVAlg_t = cusparseSpMVAlg_t(5); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cusparseSpMVAlg_t(pub ::core::ffi::c_uint); +impl cusparseSpSVAlg_t { + pub const CUSPARSE_SPSV_ALG_DEFAULT: cusparseSpSVAlg_t = cusparseSpSVAlg_t(0); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cusparseSpSVAlg_t(pub ::core::ffi::c_uint); +impl cusparseSpSVUpdate_t { + pub const CUSPARSE_SPSV_UPDATE_GENERAL: cusparseSpSVUpdate_t = cusparseSpSVUpdate_t( + 0, + ); +} +impl cusparseSpSVUpdate_t { + pub const CUSPARSE_SPSV_UPDATE_DIAGONAL: cusparseSpSVUpdate_t = cusparseSpSVUpdate_t( + 1, + ); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cusparseSpSVUpdate_t(pub ::core::ffi::c_uint); +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct cusparseSpSVDescr { + _unused: [u8; 0], +} +pub type cusparseSpSVDescr_t = *mut cusparseSpSVDescr; +impl cusparseSpSMAlg_t { + pub const CUSPARSE_SPSM_ALG_DEFAULT: cusparseSpSMAlg_t = cusparseSpSMAlg_t(0); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cusparseSpSMAlg_t(pub ::core::ffi::c_uint); +impl cusparseSpSMUpdate_t { + pub const CUSPARSE_SPSM_UPDATE_GENERAL: cusparseSpSMUpdate_t = cusparseSpSMUpdate_t( + 0, + ); +} +impl cusparseSpSMUpdate_t { + pub const CUSPARSE_SPSM_UPDATE_DIAGONAL: cusparseSpSMUpdate_t = cusparseSpSMUpdate_t( + 1, + ); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cusparseSpSMUpdate_t(pub ::core::ffi::c_uint); +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct cusparseSpSMDescr { + _unused: [u8; 0], +} +pub type cusparseSpSMDescr_t = *mut cusparseSpSMDescr; +impl cusparseSpMMAlg_t { + pub const CUSPARSE_SPMM_ALG_DEFAULT: cusparseSpMMAlg_t = cusparseSpMMAlg_t(0); +} +impl cusparseSpMMAlg_t { + pub const CUSPARSE_SPMM_COO_ALG1: cusparseSpMMAlg_t = cusparseSpMMAlg_t(1); +} +impl cusparseSpMMAlg_t { + pub const CUSPARSE_SPMM_COO_ALG2: cusparseSpMMAlg_t = cusparseSpMMAlg_t(2); +} +impl cusparseSpMMAlg_t { + pub const CUSPARSE_SPMM_COO_ALG3: cusparseSpMMAlg_t = cusparseSpMMAlg_t(3); +} +impl cusparseSpMMAlg_t { + pub const CUSPARSE_SPMM_COO_ALG4: cusparseSpMMAlg_t = cusparseSpMMAlg_t(5); +} +impl cusparseSpMMAlg_t { + pub const CUSPARSE_SPMM_CSR_ALG1: cusparseSpMMAlg_t = cusparseSpMMAlg_t(4); +} +impl cusparseSpMMAlg_t { + pub const CUSPARSE_SPMM_CSR_ALG2: cusparseSpMMAlg_t = cusparseSpMMAlg_t(6); +} +impl cusparseSpMMAlg_t { + pub const CUSPARSE_SPMM_CSR_ALG3: cusparseSpMMAlg_t = cusparseSpMMAlg_t(12); +} +impl cusparseSpMMAlg_t { + pub const CUSPARSE_SPMM_BLOCKED_ELL_ALG1: cusparseSpMMAlg_t = cusparseSpMMAlg_t(13); +} +impl cusparseSpMMAlg_t { + pub const CUSPARSE_SPMM_BSR_ALG1: cusparseSpMMAlg_t = cusparseSpMMAlg_t(14); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cusparseSpMMAlg_t(pub ::core::ffi::c_uint); +impl cusparseSpGEMMAlg_t { + pub const 
CUSPARSE_SPGEMM_DEFAULT: cusparseSpGEMMAlg_t = cusparseSpGEMMAlg_t(0); +} +impl cusparseSpGEMMAlg_t { + pub const CUSPARSE_SPGEMM_CSR_ALG_DETERMINITIC: cusparseSpGEMMAlg_t = cusparseSpGEMMAlg_t( + 1, + ); +} +impl cusparseSpGEMMAlg_t { + pub const CUSPARSE_SPGEMM_CSR_ALG_NONDETERMINITIC: cusparseSpGEMMAlg_t = cusparseSpGEMMAlg_t( + 2, + ); +} +impl cusparseSpGEMMAlg_t { + pub const CUSPARSE_SPGEMM_ALG1: cusparseSpGEMMAlg_t = cusparseSpGEMMAlg_t(3); +} +impl cusparseSpGEMMAlg_t { + pub const CUSPARSE_SPGEMM_ALG2: cusparseSpGEMMAlg_t = cusparseSpGEMMAlg_t(4); +} +impl cusparseSpGEMMAlg_t { + pub const CUSPARSE_SPGEMM_ALG3: cusparseSpGEMMAlg_t = cusparseSpGEMMAlg_t(5); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cusparseSpGEMMAlg_t(pub ::core::ffi::c_uint); +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct cusparseSpGEMMDescr { + _unused: [u8; 0], +} +pub type cusparseSpGEMMDescr_t = *mut cusparseSpGEMMDescr; +impl cusparseSDDMMAlg_t { + pub const CUSPARSE_SDDMM_ALG_DEFAULT: cusparseSDDMMAlg_t = cusparseSDDMMAlg_t(0); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cusparseSDDMMAlg_t(pub ::core::ffi::c_uint); +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct cusparseSpMMOpPlan { + _unused: [u8; 0], +} +pub type cusparseSpMMOpPlan_t = *mut cusparseSpMMOpPlan; +impl cusparseSpMMOpAlg_t { + pub const CUSPARSE_SPMM_OP_ALG_DEFAULT: cusparseSpMMOpAlg_t = cusparseSpMMOpAlg_t(0); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct cusparseSpMMOpAlg_t(pub ::core::ffi::c_uint); diff --git a/cuda_types/src/lib.rs b/cuda_types/src/lib.rs index cd8ce24..df54dba 100644 --- a/cuda_types/src/lib.rs +++ b/cuda_types/src/lib.rs @@ -1,2 +1,9 @@ +pub mod cublas; +pub mod cublaslt; pub mod cuda; -pub mod nvml; \ No newline at end of file +pub mod cudnn; +pub mod cudnn8; +pub mod cudnn9; +pub mod cufft; +pub mod cusparse; +pub mod nvml; diff --git a/cuda_types/src/nvml.rs b/cuda_types/src/nvml.rs index 525395d..d5e9896 100644 --- a/cuda_types/src/nvml.rs +++ b/cuda_types/src/nvml.rs @@ -1,6 +1,17 @@ // Generated automatically by zluda_bindgen // DO NOT EDIT MANUALLY #![allow(warnings)] +pub type __half = u16; +pub type __nv_bfloat16 = u16; +pub use super::cuda::cuComplex; +pub use super::cuda::cuDoubleComplex; +pub use super::cuda::cudaDataType; +pub use super::cuda::cudaDataType_t; +pub type cudaStream_t = super::cuda::CUstream; +pub use super::cuda::libraryPropertyType; +pub type cudaGraphExecUpdateResultInfo_st = super::cuda::CUgraphExecUpdateResultInfo_st; +pub type cudaAsyncNotificationType = super::cuda::CUasyncNotificationType_enum; +pub type cudaGraph_t = super::cuda::CUgraph; pub const NVML_API_VERSION: u32 = 12; pub const NVML_API_VERSION_STR: &[u8; 3] = b"12\0"; pub const NVML_VALUE_NOT_AVAILABLE: i32 = -1; @@ -12,34 +23,7 @@ pub const NVML_NVLINK_MAX_LINKS: u32 = 18; pub const NVML_MAX_PHYSICAL_BRIDGE: u32 = 128; pub const NVML_MAX_THERMAL_SENSORS_PER_GPU: u32 = 3; pub const NVML_MAX_GPU_PERF_PSTATES: u32 = 16; -pub const NVML_GRID_LICENSE_EXPIRY_NOT_AVAILABLE: u32 = 0; -pub const NVML_GRID_LICENSE_EXPIRY_INVALID: u32 = 1; -pub const NVML_GRID_LICENSE_EXPIRY_VALID: u32 = 2; -pub const NVML_GRID_LICENSE_EXPIRY_NOT_APPLICABLE: u32 = 3; -pub const NVML_GRID_LICENSE_EXPIRY_PERMANENT: u32 = 4; -pub const NVML_GRID_LICENSE_BUFFER_SIZE: u32 = 128; -pub const NVML_VGPU_NAME_BUFFER_SIZE: u32 = 64; -pub const NVML_GRID_LICENSE_FEATURE_MAX_COUNT: u32 = 3; -pub const 
NVML_INVALID_VGPU_PLACEMENT_ID: u32 = 65535; -pub const NVML_VGPU_VIRTUALIZATION_CAP_MIGRATION_NO: u32 = 0; -pub const NVML_VGPU_VIRTUALIZATION_CAP_MIGRATION_YES: u32 = 1; -pub const NVML_VGPU_PGPU_VIRTUALIZATION_CAP_MIGRATION_NO: u32 = 0; -pub const NVML_VGPU_PGPU_VIRTUALIZATION_CAP_MIGRATION_YES: u32 = 1; -pub const NVML_VGPU_SCHEDULER_POLICY_UNKNOWN: u32 = 0; -pub const NVML_VGPU_SCHEDULER_POLICY_BEST_EFFORT: u32 = 1; -pub const NVML_VGPU_SCHEDULER_POLICY_EQUAL_SHARE: u32 = 2; -pub const NVML_VGPU_SCHEDULER_POLICY_FIXED_SHARE: u32 = 3; -pub const NVML_SUPPORTED_VGPU_SCHEDULER_POLICY_COUNT: u32 = 3; -pub const NVML_SCHEDULER_SW_MAX_LOG_ENTRIES: u32 = 200; -pub const NVML_VGPU_SCHEDULER_ARR_DEFAULT: u32 = 0; -pub const NVML_VGPU_SCHEDULER_ARR_DISABLE: u32 = 1; -pub const NVML_VGPU_SCHEDULER_ARR_ENABLE: u32 = 2; -pub const NVML_GRID_LICENSE_STATE_UNKNOWN: u32 = 0; -pub const NVML_GRID_LICENSE_STATE_UNINITIALIZED: u32 = 1; -pub const NVML_GRID_LICENSE_STATE_UNLICENSED_UNRESTRICTED: u32 = 2; -pub const NVML_GRID_LICENSE_STATE_UNLICENSED_RESTRICTED: u32 = 3; -pub const NVML_GRID_LICENSE_STATE_UNLICENSED: u32 = 4; -pub const NVML_GRID_LICENSE_STATE_LICENSED: u32 = 5; +pub const NVML_PERF_MODES_BUFFER_SIZE: u32 = 2048; pub const NVML_GSP_FIRMWARE_VERSION_BUF_SIZE: u32 = 64; pub const NVML_DEVICE_ARCH_KEPLER: u32 = 2; pub const NVML_DEVICE_ARCH_MAXWELL: u32 = 3; @@ -49,6 +33,8 @@ pub const NVML_DEVICE_ARCH_TURING: u32 = 6; pub const NVML_DEVICE_ARCH_AMPERE: u32 = 7; pub const NVML_DEVICE_ARCH_ADA: u32 = 8; pub const NVML_DEVICE_ARCH_HOPPER: u32 = 9; +pub const NVML_DEVICE_ARCH_BLACKWELL: u32 = 10; +pub const NVML_DEVICE_ARCH_T23X: u32 = 11; pub const NVML_DEVICE_ARCH_UNKNOWN: u32 = 4294967295; pub const NVML_BUS_TYPE_UNKNOWN: u32 = 0; pub const NVML_BUS_TYPE_PCI: u32 = 1; @@ -70,6 +56,47 @@ pub const NVML_PCIE_LINK_MAX_SPEED_64000MBPS: u32 = 6; pub const NVML_ADAPTIVE_CLOCKING_INFO_STATUS_DISABLED: u32 = 0; pub const NVML_ADAPTIVE_CLOCKING_INFO_STATUS_ENABLED: u32 = 1; pub const NVML_MAX_GPU_UTILIZATIONS: u32 = 8; +pub const NVML_PCIE_ATOMICS_CAP_FETCHADD32: u32 = 1; +pub const NVML_PCIE_ATOMICS_CAP_FETCHADD64: u32 = 2; +pub const NVML_PCIE_ATOMICS_CAP_SWAP32: u32 = 4; +pub const NVML_PCIE_ATOMICS_CAP_SWAP64: u32 = 8; +pub const NVML_PCIE_ATOMICS_CAP_CAS32: u32 = 16; +pub const NVML_PCIE_ATOMICS_CAP_CAS64: u32 = 32; +pub const NVML_PCIE_ATOMICS_CAP_CAS128: u32 = 64; +pub const NVML_PCIE_ATOMICS_OPS_MAX: u32 = 7; +pub const NVML_POWER_SCOPE_GPU: u32 = 0; +pub const NVML_POWER_SCOPE_MODULE: u32 = 1; +pub const NVML_POWER_SCOPE_MEMORY: u32 = 2; +pub const NVML_GRID_LICENSE_EXPIRY_NOT_AVAILABLE: u32 = 0; +pub const NVML_GRID_LICENSE_EXPIRY_INVALID: u32 = 1; +pub const NVML_GRID_LICENSE_EXPIRY_VALID: u32 = 2; +pub const NVML_GRID_LICENSE_EXPIRY_NOT_APPLICABLE: u32 = 3; +pub const NVML_GRID_LICENSE_EXPIRY_PERMANENT: u32 = 4; +pub const NVML_GRID_LICENSE_BUFFER_SIZE: u32 = 128; +pub const NVML_VGPU_NAME_BUFFER_SIZE: u32 = 64; +pub const NVML_GRID_LICENSE_FEATURE_MAX_COUNT: u32 = 3; +pub const NVML_INVALID_VGPU_PLACEMENT_ID: u32 = 65535; +pub const NVML_VGPU_VIRTUALIZATION_CAP_MIGRATION_NO: u32 = 0; +pub const NVML_VGPU_VIRTUALIZATION_CAP_MIGRATION_YES: u32 = 1; +pub const NVML_VGPU_PGPU_VIRTUALIZATION_CAP_MIGRATION_NO: u32 = 0; +pub const NVML_VGPU_PGPU_VIRTUALIZATION_CAP_MIGRATION_YES: u32 = 1; +pub const NVML_VGPU_PGPU_HETEROGENEOUS_MODE: u32 = 0; +pub const NVML_VGPU_PGPU_HOMOGENEOUS_MODE: u32 = 1; +pub const NVML_VGPU_SCHEDULER_POLICY_UNKNOWN: u32 = 0; +pub const 
NVML_VGPU_SCHEDULER_POLICY_BEST_EFFORT: u32 = 1; +pub const NVML_VGPU_SCHEDULER_POLICY_EQUAL_SHARE: u32 = 2; +pub const NVML_VGPU_SCHEDULER_POLICY_FIXED_SHARE: u32 = 3; +pub const NVML_SUPPORTED_VGPU_SCHEDULER_POLICY_COUNT: u32 = 3; +pub const NVML_SCHEDULER_SW_MAX_LOG_ENTRIES: u32 = 200; +pub const NVML_VGPU_SCHEDULER_ARR_DEFAULT: u32 = 0; +pub const NVML_VGPU_SCHEDULER_ARR_DISABLE: u32 = 1; +pub const NVML_VGPU_SCHEDULER_ARR_ENABLE: u32 = 2; +pub const NVML_GRID_LICENSE_STATE_UNKNOWN: u32 = 0; +pub const NVML_GRID_LICENSE_STATE_UNINITIALIZED: u32 = 1; +pub const NVML_GRID_LICENSE_STATE_UNLICENSED_UNRESTRICTED: u32 = 2; +pub const NVML_GRID_LICENSE_STATE_UNLICENSED_RESTRICTED: u32 = 3; +pub const NVML_GRID_LICENSE_STATE_UNLICENSED: u32 = 4; +pub const NVML_GRID_LICENSE_STATE_LICENSED: u32 = 5; pub const NVML_FI_DEV_ECC_CURRENT: u32 = 1; pub const NVML_FI_DEV_ECC_PENDING: u32 = 2; pub const NVML_FI_DEV_ECC_SBE_VOL_TOTAL: u32 = 3; @@ -266,8 +293,83 @@ pub const NVML_FI_DEV_TEMPERATURE_SHUTDOWN_TLIMIT: u32 = 193; pub const NVML_FI_DEV_TEMPERATURE_SLOWDOWN_TLIMIT: u32 = 194; pub const NVML_FI_DEV_TEMPERATURE_MEM_MAX_TLIMIT: u32 = 195; pub const NVML_FI_DEV_TEMPERATURE_GPU_MAX_TLIMIT: u32 = 196; +pub const NVML_FI_DEV_PCIE_COUNT_TX_BYTES: u32 = 197; +pub const NVML_FI_DEV_PCIE_COUNT_RX_BYTES: u32 = 198; pub const NVML_FI_DEV_IS_MIG_MODE_INDEPENDENT_MIG_QUERY_CAPABLE: u32 = 199; -pub const NVML_FI_MAX: u32 = 200; +pub const NVML_FI_DEV_NVLINK_GET_POWER_THRESHOLD_MAX: u32 = 200; +pub const NVML_FI_DEV_NVLINK_COUNT_XMIT_PACKETS: u32 = 201; +pub const NVML_FI_DEV_NVLINK_COUNT_XMIT_BYTES: u32 = 202; +pub const NVML_FI_DEV_NVLINK_COUNT_RCV_PACKETS: u32 = 203; +pub const NVML_FI_DEV_NVLINK_COUNT_RCV_BYTES: u32 = 204; +pub const NVML_FI_DEV_NVLINK_COUNT_VL15_DROPPED: u32 = 205; +pub const NVML_FI_DEV_NVLINK_COUNT_MALFORMED_PACKET_ERRORS: u32 = 206; +pub const NVML_FI_DEV_NVLINK_COUNT_BUFFER_OVERRUN_ERRORS: u32 = 207; +pub const NVML_FI_DEV_NVLINK_COUNT_RCV_ERRORS: u32 = 208; +pub const NVML_FI_DEV_NVLINK_COUNT_RCV_REMOTE_ERRORS: u32 = 209; +pub const NVML_FI_DEV_NVLINK_COUNT_RCV_GENERAL_ERRORS: u32 = 210; +pub const NVML_FI_DEV_NVLINK_COUNT_LOCAL_LINK_INTEGRITY_ERRORS: u32 = 211; +pub const NVML_FI_DEV_NVLINK_COUNT_XMIT_DISCARDS: u32 = 212; +pub const NVML_FI_DEV_NVLINK_COUNT_LINK_RECOVERY_SUCCESSFUL_EVENTS: u32 = 213; +pub const NVML_FI_DEV_NVLINK_COUNT_LINK_RECOVERY_FAILED_EVENTS: u32 = 214; +pub const NVML_FI_DEV_NVLINK_COUNT_LINK_RECOVERY_EVENTS: u32 = 215; +pub const NVML_FI_DEV_NVLINK_COUNT_RAW_BER_LANE0: u32 = 216; +pub const NVML_FI_DEV_NVLINK_COUNT_RAW_BER_LANE1: u32 = 217; +pub const NVML_FI_DEV_NVLINK_COUNT_RAW_BER: u32 = 218; +pub const NVML_FI_DEV_NVLINK_COUNT_EFFECTIVE_ERRORS: u32 = 219; +pub const NVML_FI_DEV_NVLINK_COUNT_EFFECTIVE_BER: u32 = 220; +pub const NVML_FI_DEV_NVLINK_COUNT_SYMBOL_ERRORS: u32 = 221; +pub const NVML_FI_DEV_NVLINK_COUNT_SYMBOL_BER: u32 = 222; +pub const NVML_FI_DEV_NVLINK_GET_POWER_THRESHOLD_MIN: u32 = 223; +pub const NVML_FI_DEV_NVLINK_GET_POWER_THRESHOLD_UNITS: u32 = 224; +pub const NVML_FI_DEV_NVLINK_GET_POWER_THRESHOLD_SUPPORTED: u32 = 225; +pub const NVML_FI_DEV_RESET_STATUS: u32 = 226; +pub const NVML_FI_DEV_DRAIN_AND_RESET_STATUS: u32 = 227; +pub const NVML_FI_DEV_PCIE_OUTBOUND_ATOMICS_MASK: u32 = 228; +pub const NVML_FI_DEV_PCIE_INBOUND_ATOMICS_MASK: u32 = 229; +pub const NVML_FI_DEV_GET_GPU_RECOVERY_ACTION: u32 = 230; +pub const NVML_FI_DEV_NVLINK_COUNT_FEC_HISTORY_0: u32 = 235; +pub const NVML_FI_DEV_NVLINK_COUNT_FEC_HISTORY_1: u32 = 236; +pub const 
NVML_FI_DEV_NVLINK_COUNT_FEC_HISTORY_2: u32 = 237; +pub const NVML_FI_DEV_NVLINK_COUNT_FEC_HISTORY_3: u32 = 238; +pub const NVML_FI_DEV_NVLINK_COUNT_FEC_HISTORY_4: u32 = 239; +pub const NVML_FI_DEV_NVLINK_COUNT_FEC_HISTORY_5: u32 = 240; +pub const NVML_FI_DEV_NVLINK_COUNT_FEC_HISTORY_6: u32 = 241; +pub const NVML_FI_DEV_NVLINK_COUNT_FEC_HISTORY_7: u32 = 242; +pub const NVML_FI_DEV_NVLINK_COUNT_FEC_HISTORY_8: u32 = 243; +pub const NVML_FI_DEV_NVLINK_COUNT_FEC_HISTORY_9: u32 = 244; +pub const NVML_FI_DEV_NVLINK_COUNT_FEC_HISTORY_10: u32 = 245; +pub const NVML_FI_DEV_NVLINK_COUNT_FEC_HISTORY_11: u32 = 246; +pub const NVML_FI_DEV_NVLINK_COUNT_FEC_HISTORY_12: u32 = 247; +pub const NVML_FI_DEV_NVLINK_COUNT_FEC_HISTORY_13: u32 = 248; +pub const NVML_FI_DEV_NVLINK_COUNT_FEC_HISTORY_14: u32 = 249; +pub const NVML_FI_DEV_NVLINK_COUNT_FEC_HISTORY_15: u32 = 250; +pub const NVML_FI_PWR_SMOOTHING_ENABLED: u32 = 251; +pub const NVML_FI_PWR_SMOOTHING_PRIV_LVL: u32 = 252; +pub const NVML_FI_PWR_SMOOTHING_IMM_RAMP_DOWN_ENABLED: u32 = 253; +pub const NVML_FI_PWR_SMOOTHING_APPLIED_TMP_CEIL: u32 = 254; +pub const NVML_FI_PWR_SMOOTHING_APPLIED_TMP_FLOOR: u32 = 255; +pub const NVML_FI_PWR_SMOOTHING_MAX_PERCENT_TMP_FLOOR_SETTING: u32 = 256; +pub const NVML_FI_PWR_SMOOTHING_MIN_PERCENT_TMP_FLOOR_SETTING: u32 = 257; +pub const NVML_FI_PWR_SMOOTHING_HW_CIRCUITRY_PERCENT_LIFETIME_REMAINING: u32 = 258; +pub const NVML_FI_PWR_SMOOTHING_MAX_NUM_PRESET_PROFILES: u32 = 259; +pub const NVML_FI_PWR_SMOOTHING_PROFILE_PERCENT_TMP_FLOOR: u32 = 260; +pub const NVML_FI_PWR_SMOOTHING_PROFILE_RAMP_UP_RATE: u32 = 261; +pub const NVML_FI_PWR_SMOOTHING_PROFILE_RAMP_DOWN_RATE: u32 = 262; +pub const NVML_FI_PWR_SMOOTHING_PROFILE_RAMP_DOWN_HYST_VAL: u32 = 263; +pub const NVML_FI_PWR_SMOOTHING_ACTIVE_PRESET_PROFILE: u32 = 264; +pub const NVML_FI_PWR_SMOOTHING_ADMIN_OVERRIDE_PERCENT_TMP_FLOOR: u32 = 265; +pub const NVML_FI_PWR_SMOOTHING_ADMIN_OVERRIDE_RAMP_UP_RATE: u32 = 266; +pub const NVML_FI_PWR_SMOOTHING_ADMIN_OVERRIDE_RAMP_DOWN_RATE: u32 = 267; +pub const NVML_FI_PWR_SMOOTHING_ADMIN_OVERRIDE_RAMP_DOWN_HYST_VAL: u32 = 268; +pub const NVML_FI_MAX: u32 = 269; +pub const NVML_NVLINK_LOW_POWER_THRESHOLD_UNIT_100US: u32 = 0; +pub const NVML_NVLINK_LOW_POWER_THRESHOLD_UNIT_50US: u32 = 1; +pub const NVML_NVLINK_POWER_STATE_HIGH_SPEED: u32 = 0; +pub const NVML_NVLINK_POWER_STATE_LOW: u32 = 1; +pub const NVML_NVLINK_LOW_POWER_THRESHOLD_MIN: u32 = 1; +pub const NVML_NVLINK_LOW_POWER_THRESHOLD_MAX: u32 = 8191; +pub const NVML_NVLINK_LOW_POWER_THRESHOLD_RESET: u32 = 4294967295; +pub const NVML_NVLINK_LOW_POWER_THRESHOLD_DEFAULT: u32 = 4294967295; pub const NVML_NVFBC_SESSION_FLAG_DIFFMAP_ENABLED: u32 = 1; pub const NVML_NVFBC_SESSION_FLAG_CLASSIFICATIONMAP_ENABLED: u32 = 2; pub const NVML_NVFBC_SESSION_FLAG_CAPTURE_WITH_WAIT_NO_WAIT: u32 = 4; @@ -276,6 +378,8 @@ pub const NVML_NVFBC_SESSION_FLAG_CAPTURE_WITH_WAIT_TIMEOUT: u32 = 16; pub const NVML_CC_SYSTEM_CPU_CAPS_NONE: u32 = 0; pub const NVML_CC_SYSTEM_CPU_CAPS_AMD_SEV: u32 = 1; pub const NVML_CC_SYSTEM_CPU_CAPS_INTEL_TDX: u32 = 2; +pub const NVML_CC_SYSTEM_CPU_CAPS_AMD_SEV_SNP: u32 = 3; +pub const NVML_CC_SYSTEM_CPU_CAPS_AMD_SNP_VTOM: u32 = 4; pub const NVML_CC_SYSTEM_GPUS_CC_NOT_CAPABLE: u32 = 0; pub const NVML_CC_SYSTEM_GPUS_CC_CAPABLE: u32 = 1; pub const NVML_CC_SYSTEM_DEVTOOLS_MODE_OFF: u32 = 0; @@ -297,7 +401,7 @@ pub const NVML_CC_GPU_CEC_ATTESTATION_REPORT_SIZE: u32 = 4096; pub const NVML_CC_CEC_ATTESTATION_REPORT_NOT_PRESENT: u32 = 0; pub const NVML_CC_CEC_ATTESTATION_REPORT_PRESENT: 
u32 = 1; pub const NVML_CC_KEY_ROTATION_THRESHOLD_ATTACKER_ADVANTAGE_MIN: u32 = 50; -pub const NVML_CC_KEY_ROTATION_THRESHOLD_ATTACKER_ADVANTAGE_MAX: u32 = 75; +pub const NVML_CC_KEY_ROTATION_THRESHOLD_ATTACKER_ADVANTAGE_MAX: u32 = 65; pub const NVML_GPU_FABRIC_UUID_LEN: u32 = 16; pub const NVML_GPU_FABRIC_STATE_NOT_SUPPORTED: u32 = 0; pub const NVML_GPU_FABRIC_STATE_NOT_STARTED: u32 = 1; @@ -307,10 +411,22 @@ pub const NVML_GPU_FABRIC_HEALTH_MASK_DEGRADED_BW_NOT_SUPPORTED: u32 = 0; pub const NVML_GPU_FABRIC_HEALTH_MASK_DEGRADED_BW_TRUE: u32 = 1; pub const NVML_GPU_FABRIC_HEALTH_MASK_DEGRADED_BW_FALSE: u32 = 2; pub const NVML_GPU_FABRIC_HEALTH_MASK_SHIFT_DEGRADED_BW: u32 = 0; -pub const NVML_GPU_FABRIC_HEALTH_MASK_WIDTH_DEGRADED_BW: u32 = 17; -pub const NVML_POWER_SCOPE_GPU: u32 = 0; -pub const NVML_POWER_SCOPE_MODULE: u32 = 1; -pub const NVML_POWER_SCOPE_MEMORY: u32 = 2; +pub const NVML_GPU_FABRIC_HEALTH_MASK_WIDTH_DEGRADED_BW: u32 = 3; +pub const NVML_GPU_FABRIC_HEALTH_MASK_ROUTE_RECOVERY_NOT_SUPPORTED: u32 = 0; +pub const NVML_GPU_FABRIC_HEALTH_MASK_ROUTE_RECOVERY_TRUE: u32 = 1; +pub const NVML_GPU_FABRIC_HEALTH_MASK_ROUTE_RECOVERY_FALSE: u32 = 2; +pub const NVML_GPU_FABRIC_HEALTH_MASK_SHIFT_ROUTE_RECOVERY: u32 = 2; +pub const NVML_GPU_FABRIC_HEALTH_MASK_WIDTH_ROUTE_RECOVERY: u32 = 3; +pub const NVML_GPU_FABRIC_HEALTH_MASK_ROUTE_UNHEALTHY_NOT_SUPPORTED: u32 = 0; +pub const NVML_GPU_FABRIC_HEALTH_MASK_ROUTE_UNHEALTHY_TRUE: u32 = 1; +pub const NVML_GPU_FABRIC_HEALTH_MASK_ROUTE_UNHEALTHY_FALSE: u32 = 2; +pub const NVML_GPU_FABRIC_HEALTH_MASK_SHIFT_ROUTE_UNHEALTHY: u32 = 4; +pub const NVML_GPU_FABRIC_HEALTH_MASK_WIDTH_ROUTE_UNHEALTHY: u32 = 3; +pub const NVML_GPU_FABRIC_HEALTH_MASK_ACCESS_TIMEOUT_RECOVERY_NOT_SUPPORTED: u32 = 0; +pub const NVML_GPU_FABRIC_HEALTH_MASK_ACCESS_TIMEOUT_RECOVERY_TRUE: u32 = 1; +pub const NVML_GPU_FABRIC_HEALTH_MASK_ACCESS_TIMEOUT_RECOVERY_FALSE: u32 = 2; +pub const NVML_GPU_FABRIC_HEALTH_MASK_SHIFT_ACCESS_TIMEOUT_RECOVERY: u32 = 6; +pub const NVML_GPU_FABRIC_HEALTH_MASK_WIDTH_ACCESS_TIMEOUT_RECOVERY: u32 = 3; pub const NVML_INIT_FLAG_NO_GPUS: u32 = 1; pub const NVML_INIT_FLAG_NO_ATTACH: u32 = 2; pub const NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE: u32 = 16; @@ -325,6 +441,14 @@ pub const NVML_DEVICE_SERIAL_BUFFER_SIZE: u32 = 30; pub const NVML_DEVICE_VBIOS_VERSION_BUFFER_SIZE: u32 = 32; pub const NVML_AFFINITY_SCOPE_NODE: u32 = 0; pub const NVML_AFFINITY_SCOPE_SOCKET: u32 = 1; +pub const NVML_NVLINK_BER_MANTISSA_SHIFT: u32 = 8; +pub const NVML_NVLINK_BER_MANTISSA_WIDTH: u32 = 15; +pub const NVML_NVLINK_BER_EXP_SHIFT: u32 = 0; +pub const NVML_NVLINK_BER_EXP_WIDTH: u32 = 255; +pub const NVML_NVLINK_STATE_INACTIVE: u32 = 0; +pub const NVML_NVLINK_STATE_ACTIVE: u32 = 1; +pub const NVML_NVLINK_STATE_SLEEP: u32 = 2; +pub const NVML_NVLINK_TOTAL_SUPPORTED_BW_MODES: u32 = 23; pub const NVML_DEVICE_MIG_DISABLE: u32 = 0; pub const NVML_DEVICE_MIG_ENABLE: u32 = 1; pub const NVML_GPU_INSTANCE_PROFILE_1_SLICE: u32 = 0; @@ -337,8 +461,14 @@ pub const NVML_GPU_INSTANCE_PROFILE_6_SLICE: u32 = 6; pub const NVML_GPU_INSTANCE_PROFILE_1_SLICE_REV1: u32 = 7; pub const NVML_GPU_INSTANCE_PROFILE_2_SLICE_REV1: u32 = 8; pub const NVML_GPU_INSTANCE_PROFILE_1_SLICE_REV2: u32 = 9; -pub const NVML_GPU_INSTANCE_PROFILE_COUNT: u32 = 10; +pub const NVML_GPU_INSTANCE_PROFILE_1_SLICE_GFX: u32 = 10; +pub const NVML_GPU_INSTANCE_PROFILE_2_SLICE_GFX: u32 = 11; +pub const NVML_GPU_INSTANCE_PROFILE_4_SLICE_GFX: u32 = 12; +pub const NVML_GPU_INSTANCE_PROFILE_COUNT: u32 = 13; +pub const 
NVML_GPU_INSTANCE_PROFILE_CAPS_P2P: u32 = 1; pub const NVML_GPU_INTSTANCE_PROFILE_CAPS_P2P: u32 = 1; +pub const NVML_GPU_INSTANCE_PROFILE_CAPS_GFX: u32 = 2; +pub const NVML_COMPUTE_INSTANCE_PROFILE_CAPS_GFX: u32 = 1; pub const NVML_COMPUTE_INSTANCE_PROFILE_1_SLICE: u32 = 0; pub const NVML_COMPUTE_INSTANCE_PROFILE_2_SLICE: u32 = 1; pub const NVML_COMPUTE_INSTANCE_PROFILE_3_SLICE: u32 = 2; @@ -352,11 +482,17 @@ pub const NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED: u32 = 0; pub const NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_COUNT: u32 = 1; pub const NVML_GPM_METRICS_GET_VERSION: u32 = 1; pub const NVML_GPM_SUPPORT_VERSION: u32 = 1; -pub const NVML_NVLINK_POWER_STATE_HIGH_SPEED: u32 = 0; -pub const NVML_NVLINK_POWER_STATE_LOW: u32 = 1; -pub const NVML_NVLINK_LOW_POWER_THRESHOLD_MIN: u32 = 1; -pub const NVML_NVLINK_LOW_POWER_THRESHOLD_MAX: u32 = 8191; -pub const NVML_NVLINK_LOW_POWER_THRESHOLD_RESET: u32 = 4294967295; +pub const NVML_DEV_CAP_EGM: u32 = 1; +pub const NVML_255_MASK_BITS_PER_ELEM: u32 = 32; +pub const NVML_255_MASK_NUM_ELEMS: u32 = 8; +pub const NVML_WORKLOAD_POWER_MAX_PROFILES: u32 = 255; +pub const NVML_POWER_SMOOTHING_MAX_NUM_PROFILES: u32 = 5; +pub const NVML_POWER_SMOOTHING_NUM_PROFILE_PARAMS: u32 = 4; +pub const NVML_POWER_SMOOTHING_ADMIN_OVERRIDE_NOT_SET: u32 = 4294967295; +pub const NVML_POWER_SMOOTHING_PROFILE_PARAM_PERCENT_TMP_FLOOR: u32 = 0; +pub const NVML_POWER_SMOOTHING_PROFILE_PARAM_RAMP_UP_RATE: u32 = 1; +pub const NVML_POWER_SMOOTHING_PROFILE_PARAM_RAMP_DOWN_RATE: u32 = 2; +pub const NVML_POWER_SMOOTHING_PROFILE_PARAM_RAMP_DOWN_HYSTERESIS: u32 = 3; #[repr(C)] #[derive(Debug, Copy, Clone)] pub struct nvmlDevice_st { @@ -1054,7 +1190,10 @@ impl nvmlValueType_enum { pub const NVML_VALUE_TYPE_SIGNED_INT: nvmlValueType_enum = nvmlValueType_enum(5); } impl nvmlValueType_enum { - pub const NVML_VALUE_TYPE_COUNT: nvmlValueType_enum = nvmlValueType_enum(6); + pub const NVML_VALUE_TYPE_UNSIGNED_SHORT: nvmlValueType_enum = nvmlValueType_enum(6); +} +impl nvmlValueType_enum { + pub const NVML_VALUE_TYPE_COUNT: nvmlValueType_enum = nvmlValueType_enum(7); } #[repr(transparent)] /// Represents the type for sample value returned @@ -1078,6 +1217,8 @@ pub union nvmlValue_st { pub ullVal: ::core::ffi::c_ulonglong, ///!< If the value is signed long long pub sllVal: ::core::ffi::c_longlong, + ///!< If the value is unsigned short + pub usVal: ::core::ffi::c_ushort, } /// Union to represent different types of Value pub type nvmlValue_t = nvmlValue_st; @@ -1208,6 +1349,7 @@ impl nvmlThermalTarget_t { pub const NVML_THERMAL_TARGET_UNKNOWN: nvmlThermalTarget_t = nvmlThermalTarget_t(-1); } #[repr(transparent)] +/// Represents the thermal sensor targets #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] pub struct nvmlThermalTarget_t(pub ::core::ffi::c_int); impl nvmlThermalController_t { @@ -1306,8 +1448,10 @@ impl nvmlThermalController_t { ); } #[repr(transparent)] +/// Represents the thermal sensor controllers #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] pub struct nvmlThermalController_t(pub ::core::ffi::c_int); +/// Struct to hold the thermal sensor settings #[repr(C)] #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] pub struct nvmlGpuThermalSettings_t { @@ -1323,6 +1467,84 @@ pub struct nvmlGpuThermalSettings_t__bindgen_ty_1 { pub currentTemp: ::core::ffi::c_int, pub target: nvmlThermalTarget_t, } +impl nvmlCoolerControl_enum { + ///!< This cooler has no control signal. 
+ pub const NVML_THERMAL_COOLER_SIGNAL_NONE: nvmlCoolerControl_enum = nvmlCoolerControl_enum( + 0, + ); +} +impl nvmlCoolerControl_enum { + ///!< This cooler can only be toggled either ON or OFF (eg a switch). + pub const NVML_THERMAL_COOLER_SIGNAL_TOGGLE: nvmlCoolerControl_enum = nvmlCoolerControl_enum( + 1, + ); +} +impl nvmlCoolerControl_enum { + ///!< This cooler's level can be adjusted from some minimum to some maximum (eg a knob). + pub const NVML_THERMAL_COOLER_SIGNAL_VARIABLE: nvmlCoolerControl_enum = nvmlCoolerControl_enum( + 2, + ); +} +impl nvmlCoolerControl_enum { + pub const NVML_THERMAL_COOLER_SIGNAL_COUNT: nvmlCoolerControl_enum = nvmlCoolerControl_enum( + 3, + ); +} +#[repr(transparent)] +/// Cooler control type +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct nvmlCoolerControl_enum(pub ::core::ffi::c_uint); +/// Cooler control type +pub use self::nvmlCoolerControl_enum as nvmlCoolerControl_t; +impl nvmlCoolerTarget_enum { + ///!< This cooler cools nothing. + pub const NVML_THERMAL_COOLER_TARGET_NONE: nvmlCoolerTarget_enum = nvmlCoolerTarget_enum( + 1, + ); +} +impl nvmlCoolerTarget_enum { + ///!< This cooler can cool the GPU. + pub const NVML_THERMAL_COOLER_TARGET_GPU: nvmlCoolerTarget_enum = nvmlCoolerTarget_enum( + 2, + ); +} +impl nvmlCoolerTarget_enum { + ///!< This cooler can cool the memory. + pub const NVML_THERMAL_COOLER_TARGET_MEMORY: nvmlCoolerTarget_enum = nvmlCoolerTarget_enum( + 4, + ); +} +impl nvmlCoolerTarget_enum { + ///!< This cooler can cool the power supply. + pub const NVML_THERMAL_COOLER_TARGET_POWER_SUPPLY: nvmlCoolerTarget_enum = nvmlCoolerTarget_enum( + 8, + ); +} +impl nvmlCoolerTarget_enum { + ///!< This cooler cools all of the components related to its target gpu. GPU_RELATED = GPU | MEMORY | POWER_SUPPLY + pub const NVML_THERMAL_COOLER_TARGET_GPU_RELATED: nvmlCoolerTarget_enum = nvmlCoolerTarget_enum( + 14, + ); +} +#[repr(transparent)] +/// Cooler's target +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct nvmlCoolerTarget_enum(pub ::core::ffi::c_uint); +/// Cooler's target +pub use self::nvmlCoolerTarget_enum as nvmlCoolerTarget_t; +#[repr(C)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct nvmlCoolerInfo_v1_t { + ///!< the API version number + pub version: ::core::ffi::c_uint, + ///!< the cooler index + pub index: ::core::ffi::c_uint, + ///!< OUT: the cooler's control signal characteristics + pub signalType: nvmlCoolerControl_t, + ///!< OUT: the target that cooler cools + pub target: nvmlCoolerTarget_t, +} +pub type nvmlCoolerInfo_t = nvmlCoolerInfo_v1_t; impl nvmlEnableState_enum { ///!< Feature disabled pub const NVML_FEATURE_DISABLED: nvmlEnableState_enum = nvmlEnableState_enum(0); @@ -1337,6 +1559,17 @@ impl nvmlEnableState_enum { pub struct nvmlEnableState_enum(pub ::core::ffi::c_uint); /// Generic enable/disable enum. 
pub use self::nvmlEnableState_enum as nvmlEnableState_t; +/// DRAM Encryption Info +#[repr(C)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct nvmlDramEncryptionInfo_v1_t { + ///!< IN - the API version number + pub version: ::core::ffi::c_uint, + ///!< IN/OUT - DRAM Encryption state + pub encryptionState: nvmlEnableState_t, +} +/// DRAM Encryption Info +pub type nvmlDramEncryptionInfo_t = nvmlDramEncryptionInfo_v1_t; impl nvmlBrandType_enum { pub const NVML_BRAND_UNKNOWN: nvmlBrandType_enum = nvmlBrandType_enum(0); } @@ -1438,10 +1671,15 @@ impl nvmlTemperatureThresholds_enum { ); } impl nvmlTemperatureThresholds_enum { - pub const NVML_TEMPERATURE_THRESHOLD_COUNT: nvmlTemperatureThresholds_enum = nvmlTemperatureThresholds_enum( + pub const NVML_TEMPERATURE_THRESHOLD_GPS_CURR: nvmlTemperatureThresholds_enum = nvmlTemperatureThresholds_enum( 7, ); } +impl nvmlTemperatureThresholds_enum { + pub const NVML_TEMPERATURE_THRESHOLD_COUNT: nvmlTemperatureThresholds_enum = nvmlTemperatureThresholds_enum( + 8, + ); +} #[repr(transparent)] /// Temperature thresholds. #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] @@ -1465,6 +1703,17 @@ impl nvmlTemperatureSensors_enum { pub struct nvmlTemperatureSensors_enum(pub ::core::ffi::c_uint); /// Temperature sensors. pub use self::nvmlTemperatureSensors_enum as nvmlTemperatureSensors_t; +/// Margin temperature values +#[repr(C)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct nvmlMarginTemperature_v1_t { + ///!< The version number of this struct + pub version: ::core::ffi::c_uint, + ///!< The margin temperature value + pub marginTemperature: ::core::ffi::c_int, +} +/// Margin temperature values +pub type nvmlMarginTemperature_t = nvmlMarginTemperature_v1_t; impl nvmlComputeMode_enum { ///!< Default compute mode -- multiple contexts per device pub const NVML_COMPUTEMODE_DEFAULT: nvmlComputeMode_enum = nvmlComputeMode_enum(0); @@ -1558,6 +1807,52 @@ impl nvmlMemoryErrorType_enum { pub struct nvmlMemoryErrorType_enum(pub ::core::ffi::c_uint); /// Memory error types pub use self::nvmlMemoryErrorType_enum as nvmlMemoryErrorType_t; +impl nvmlNvlinkVersion_enum { + pub const NVML_NVLINK_VERSION_INVALID: nvmlNvlinkVersion_enum = nvmlNvlinkVersion_enum( + 0, + ); +} +impl nvmlNvlinkVersion_enum { + pub const NVML_NVLINK_VERSION_1_0: nvmlNvlinkVersion_enum = nvmlNvlinkVersion_enum( + 1, + ); +} +impl nvmlNvlinkVersion_enum { + pub const NVML_NVLINK_VERSION_2_0: nvmlNvlinkVersion_enum = nvmlNvlinkVersion_enum( + 2, + ); +} +impl nvmlNvlinkVersion_enum { + pub const NVML_NVLINK_VERSION_2_2: nvmlNvlinkVersion_enum = nvmlNvlinkVersion_enum( + 3, + ); +} +impl nvmlNvlinkVersion_enum { + pub const NVML_NVLINK_VERSION_3_0: nvmlNvlinkVersion_enum = nvmlNvlinkVersion_enum( + 4, + ); +} +impl nvmlNvlinkVersion_enum { + pub const NVML_NVLINK_VERSION_3_1: nvmlNvlinkVersion_enum = nvmlNvlinkVersion_enum( + 5, + ); +} +impl nvmlNvlinkVersion_enum { + pub const NVML_NVLINK_VERSION_4_0: nvmlNvlinkVersion_enum = nvmlNvlinkVersion_enum( + 6, + ); +} +impl nvmlNvlinkVersion_enum { + pub const NVML_NVLINK_VERSION_5_0: nvmlNvlinkVersion_enum = nvmlNvlinkVersion_enum( + 7, + ); +} +#[repr(transparent)] +/// Represents Nvlink Version +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct nvmlNvlinkVersion_enum(pub ::core::ffi::c_uint); +/// Represents Nvlink Version +pub use self::nvmlNvlinkVersion_enum as nvmlNvlinkVersion_t; impl nvmlEccCounterType_enum { ///!< Volatile counts are reset each time the driver loads. 
pub const NVML_VOLATILE_ECC: nvmlEccCounterType_enum = nvmlEccCounterType_enum(0); @@ -1651,9 +1946,13 @@ impl nvmlDriverModel_enum { pub const NVML_DRIVER_WDDM: nvmlDriverModel_enum = nvmlDriverModel_enum(0); } impl nvmlDriverModel_enum { - ///!< WDM (TCC) model (recommended) -- GPU treated as a generic device + ///!< WDM (TCC) model (deprecated) -- GPU treated as a generic compute device pub const NVML_DRIVER_WDM: nvmlDriverModel_enum = nvmlDriverModel_enum(1); } +impl nvmlDriverModel_enum { + ///!< MCDM driver model -- GPU treated as a Microsoft compute device + pub const NVML_DRIVER_MCDM: nvmlDriverModel_enum = nvmlDriverModel_enum(2); +} #[repr(transparent)] /** Driver models. @@ -1738,6 +2037,55 @@ impl nvmlPStates_enum { pub struct nvmlPStates_enum(pub ::core::ffi::c_uint); /// Allowed PStates. pub use self::nvmlPStates_enum as nvmlPstates_t; +/// Clock offset info. +#[repr(C)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct nvmlClockOffset_v1_t { + ///!< The version number of this struct + pub version: ::core::ffi::c_uint, + pub type_: nvmlClockType_t, + pub pstate: nvmlPstates_t, + pub clockOffsetMHz: ::core::ffi::c_int, + pub minClockOffsetMHz: ::core::ffi::c_int, + pub maxClockOffsetMHz: ::core::ffi::c_int, +} +/// Clock offset info. +pub type nvmlClockOffset_t = nvmlClockOffset_v1_t; +/// Fan speed info. +#[repr(C)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct nvmlFanSpeedInfo_v1_t { + ///!< the API version number + pub version: ::core::ffi::c_uint, + ///!< the fan index + pub fan: ::core::ffi::c_uint, + ///!< OUT: the fan speed in RPM + pub speed: ::core::ffi::c_uint, +} +/// Fan speed info. +pub type nvmlFanSpeedInfo_t = nvmlFanSpeedInfo_v1_t; +/// Device performance modes string +#[repr(C)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct nvmlDevicePerfModes_v1_t { + ///!< the API version number + pub version: ::core::ffi::c_uint, + ///!< OUT: the performance modes string. + pub str_: [::core::ffi::c_char; 2048usize], +} +/// Device performance modes string +pub type nvmlDevicePerfModes_t = nvmlDevicePerfModes_v1_t; +/// Device current clocks string +#[repr(C)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct nvmlDeviceCurrentClockFreqs_v1_t { + ///!< the API version number + pub version: ::core::ffi::c_uint, + ///!< OUT: the current clock frequency string. + pub str_: [::core::ffi::c_char; 2048usize], +} +/// Device current clocks string +pub type nvmlDeviceCurrentClockFreqs_t = nvmlDeviceCurrentClockFreqs_v1_t; impl nvmlGom_enum { ///!< Everything is enabled and running at full speed pub const NVML_GOM_ALL_ON: nvmlGom_enum = nvmlGom_enum(0); @@ -1778,9 +2126,13 @@ impl nvmlInforomObject_enum { ///!< The power management object pub const NVML_INFOROM_POWER: nvmlInforomObject_enum = nvmlInforomObject_enum(2); } +impl nvmlInforomObject_enum { + ///!< DRAM Encryption object + pub const NVML_INFOROM_DEN: nvmlInforomObject_enum = nvmlInforomObject_enum(3); +} impl nvmlInforomObject_enum { ///!< This counts the number of infoROM objects the driver knows about - pub const NVML_INFOROM_COUNT: nvmlInforomObject_enum = nvmlInforomObject_enum(3); + pub const NVML_INFOROM_COUNT: nvmlInforomObject_enum = nvmlInforomObject_enum(4); } #[repr(transparent)] /// Available infoROM objects. 
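[Editor's sketch, not part of the generated patch.] The versioned structs introduced above (nvmlClockOffset_v1_t, nvmlFanSpeedInfo_v1_t, nvmlDevicePerfModes_v1_t, nvmlDeviceCurrentClockFreqs_v1_t) all lead with a version field. A minimal sketch of how a caller of these bindings might fill that field, assuming nvml.h's NVML_STRUCT_VERSION convention (struct size in the low bits, version number in the top byte); the nvml_struct_version helper is illustrative and is not part of the generated bindings:
// Sketch only: assumes the cuda_types::nvml bindings above are in scope.
use core::mem::size_of;
/// Hypothetical counterpart of nvml.h's NVML_STRUCT_VERSION(data, ver) macro,
/// assumed to be sizeof(nvml<data>_v<ver>_t) | (ver << 24).
const fn nvml_struct_version<T>(ver: u32) -> u32 {
    (size_of::<T>() as u32) | (ver << 24)
}
fn fan_speed_query(fan_index: u32) -> nvmlFanSpeedInfo_v1_t {
    nvmlFanSpeedInfo_v1_t {
        version: nvml_struct_version::<nvmlFanSpeedInfo_v1_t>(1),
        fan: fan_index, // IN: index of the fan to query
        speed: 0,       // OUT: written by the driver on a successful call
    }
}
Initializing version this way lets the driver reject a struct revision it does not understand instead of misreading the payload.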
@@ -1904,6 +2256,203 @@ impl nvmlRestrictedAPI_enum { pub struct nvmlRestrictedAPI_enum(pub ::core::ffi::c_uint); /// API types that allow changes to default permission restrictions pub use self::nvmlRestrictedAPI_enum as nvmlRestrictedAPI_t; +/// Structure to store utilization value and process Id +#[repr(C)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct nvmlProcessUtilizationSample_st { + ///!< PID of process + pub pid: ::core::ffi::c_uint, + ///!< CPU Timestamp in microseconds + pub timeStamp: ::core::ffi::c_ulonglong, + ///!< SM (3D/Compute) Util Value + pub smUtil: ::core::ffi::c_uint, + ///!< Frame Buffer Memory Util Value + pub memUtil: ::core::ffi::c_uint, + ///!< Encoder Util Value + pub encUtil: ::core::ffi::c_uint, + ///!< Decoder Util Value + pub decUtil: ::core::ffi::c_uint, +} +/// Structure to store utilization value and process Id +pub type nvmlProcessUtilizationSample_t = nvmlProcessUtilizationSample_st; +/// Structure to store utilization value and process Id -- version 1 +#[repr(C)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct nvmlProcessUtilizationInfo_v1_t { + ///!< CPU Timestamp in microseconds + pub timeStamp: ::core::ffi::c_ulonglong, + ///!< PID of process + pub pid: ::core::ffi::c_uint, + ///!< SM (3D/Compute) Util Value + pub smUtil: ::core::ffi::c_uint, + ///!< Frame Buffer Memory Util Value + pub memUtil: ::core::ffi::c_uint, + ///!< Encoder Util Value + pub encUtil: ::core::ffi::c_uint, + ///!< Decoder Util Value + pub decUtil: ::core::ffi::c_uint, + ///!< Jpeg Util Value + pub jpgUtil: ::core::ffi::c_uint, + ///!< Ofa Util Value + pub ofaUtil: ::core::ffi::c_uint, +} +/// Structure to store utilization and process ID for each running process -- version 1 +#[repr(C)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct nvmlProcessesUtilizationInfo_v1_t { + ///!< The version number of this struct + pub version: ::core::ffi::c_uint, + ///!< Caller-supplied array size, and returns number of processes running + pub processSamplesCount: ::core::ffi::c_uint, + ///!< Return only samples with timestamp greater than lastSeenTimeStamp + pub lastSeenTimeStamp: ::core::ffi::c_ulonglong, + ///!< The array (allocated by caller) of the utilization of GPU SM, framebuffer, video encoder, video decoder, JPEG, and OFA + pub procUtilArray: *mut nvmlProcessUtilizationInfo_v1_t, +} +/// Structure to store utilization and process ID for each running process -- version 1 +pub type nvmlProcessesUtilizationInfo_t = nvmlProcessesUtilizationInfo_v1_t; +/// Structure to store SRAM uncorrectable error counters +#[repr(C)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct nvmlEccSramErrorStatus_v1_t { + ///!< the API version number + pub version: ::core::ffi::c_uint, + ///!< aggregate uncorrectable parity error count + pub aggregateUncParity: ::core::ffi::c_ulonglong, + ///!< aggregate uncorrectable SEC-DED error count + pub aggregateUncSecDed: ::core::ffi::c_ulonglong, + ///!< aggregate correctable error count + pub aggregateCor: ::core::ffi::c_ulonglong, + ///!< volatile uncorrectable parity error count + pub volatileUncParity: ::core::ffi::c_ulonglong, + ///!< volatile uncorrectable SEC-DED error count + pub volatileUncSecDed: ::core::ffi::c_ulonglong, + ///!< volatile correctable error count + pub volatileCor: ::core::ffi::c_ulonglong, + ///!< aggregate uncorrectable error count for L2 cache bucket + pub aggregateUncBucketL2: ::core::ffi::c_ulonglong, + ///!< aggregate uncorrectable error count for SM bucket + pub 
aggregateUncBucketSm: ::core::ffi::c_ulonglong, + ///!< aggregate uncorrectable error count for PCIE bucket + pub aggregateUncBucketPcie: ::core::ffi::c_ulonglong, + ///!< aggregate uncorrectable error count for Microcontroller bucket + pub aggregateUncBucketMcu: ::core::ffi::c_ulonglong, + ///!< aggregate uncorrectable error count for Other bucket + pub aggregateUncBucketOther: ::core::ffi::c_ulonglong, + ///!< if the error threshold of field diag is exceeded + pub bThresholdExceeded: ::core::ffi::c_uint, +} +/// Structure to store SRAM uncorrectable error counters +pub type nvmlEccSramErrorStatus_t = nvmlEccSramErrorStatus_v1_t; +/** Structure to store platform information + + @deprecated The nvmlPlatformInfo_v1_t will be deprecated in the subsequent releases. + Use nvmlPlatformInfo_v2_t*/ +#[repr(C)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct nvmlPlatformInfo_v1_t { + ///!< the API version number + pub version: ::core::ffi::c_uint, + ///!< Infiniband GUID reported by platform (for Blackwell, ibGuid is 8 bytes so indices 8-15 are zero) + pub ibGuid: [::core::ffi::c_uchar; 16usize], + ///!< GUID of the rack containing this GPU (for Blackwell rackGuid is 13 bytes so indices 13-15 are zero) + pub rackGuid: [::core::ffi::c_uchar; 16usize], + ///!< The slot number in the rack containing this GPU (includes switches) + pub chassisPhysicalSlotNumber: ::core::ffi::c_uchar, + ///!< The index within the compute slots in the rack containing this GPU (does not include switches) + pub computeSlotIndex: ::core::ffi::c_uchar, + ///!< Index of the node within the slot containing this GPU + pub nodeIndex: ::core::ffi::c_uchar, + ///!< Platform indicated NVLink-peer type (e.g. switch present or not) + pub peerType: ::core::ffi::c_uchar, + ///!< ID of this GPU within the node + pub moduleId: ::core::ffi::c_uchar, +} +#[repr(C)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct nvmlPlatformInfo_v2_t { + ///!< the API version number + pub version: ::core::ffi::c_uint, + ///!< Infiniband GUID reported by platform (for Blackwell, ibGuid is 8 bytes so indices 8-15 are zero) + pub ibGuid: [::core::ffi::c_uchar; 16usize], + ///!< Serial number of the chassis containing this GPU (for Blackwell it is 13 bytes so indices 13-15 are zero) + pub chassisSerialNumber: [::core::ffi::c_uchar; 16usize], + ///!< The slot number in the chassis containing this GPU (includes switches) + pub slotNumber: ::core::ffi::c_uchar, + ///!< The tray index within the compute slots in the chassis containing this GPU (does not include switches) + pub trayIndex: ::core::ffi::c_uchar, + ///!< Index of the node within the slot containing this GPU + pub hostId: ::core::ffi::c_uchar, + ///!< Platform indicated NVLink-peer type (e.g. 
switch present or not) + pub peerType: ::core::ffi::c_uchar, + ///!< ID of this GPU within the node + pub moduleId: ::core::ffi::c_uchar, +} +pub type nvmlPlatformInfo_t = nvmlPlatformInfo_v2_t; +pub type nvmlDeviceArchitecture_t = ::core::ffi::c_uint; +pub type nvmlBusType_t = ::core::ffi::c_uint; +pub type nvmlFanControlPolicy_t = ::core::ffi::c_uint; +pub type nvmlPowerSource_t = ::core::ffi::c_uint; +impl nvmlGpuUtilizationDomainId_t { + ///!< Graphics engine domain + pub const NVML_GPU_UTILIZATION_DOMAIN_GPU: nvmlGpuUtilizationDomainId_t = nvmlGpuUtilizationDomainId_t( + 0, + ); +} +impl nvmlGpuUtilizationDomainId_t { + ///!< Frame buffer domain + pub const NVML_GPU_UTILIZATION_DOMAIN_FB: nvmlGpuUtilizationDomainId_t = nvmlGpuUtilizationDomainId_t( + 1, + ); +} +impl nvmlGpuUtilizationDomainId_t { + ///!< Video engine domain + pub const NVML_GPU_UTILIZATION_DOMAIN_VID: nvmlGpuUtilizationDomainId_t = nvmlGpuUtilizationDomainId_t( + 2, + ); +} +impl nvmlGpuUtilizationDomainId_t { + ///!< Bus interface domain + pub const NVML_GPU_UTILIZATION_DOMAIN_BUS: nvmlGpuUtilizationDomainId_t = nvmlGpuUtilizationDomainId_t( + 3, + ); +} +#[repr(transparent)] +/// Represents the GPU utilization domains +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct nvmlGpuUtilizationDomainId_t(pub ::core::ffi::c_uint); +#[repr(C)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct nvmlGpuDynamicPstatesInfo_st { + ///!< Reserved for future use + pub flags: ::core::ffi::c_uint, + pub utilization: [nvmlGpuDynamicPstatesInfo_st__bindgen_ty_1; 8usize], +} +#[repr(C)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct nvmlGpuDynamicPstatesInfo_st__bindgen_ty_1 { + ///!< Set if this utilization domain is present on this GPU + pub bIsPresent: ::core::ffi::c_uint, + ///!< Percentage of time where the domain is considered busy in the last 1-second interval + pub percentage: ::core::ffi::c_uint, + ///!< Utilization threshold that can trigger a perf-increasing P-State change when crossed + pub incThreshold: ::core::ffi::c_uint, + ///!< Utilization threshold that can trigger a perf-decreasing P-State change when crossed + pub decThreshold: ::core::ffi::c_uint, +} +pub type nvmlGpuDynamicPstatesInfo_t = nvmlGpuDynamicPstatesInfo_st; +pub type nvmlPowerScopeType_t = ::core::ffi::c_uchar; +/// Contains the power management limit +#[repr(C)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct nvmlPowerValue_v2_t { + ///!< Structure format version (must be 1) + pub version: ::core::ffi::c_uint, + ///!< [in] Device type: GPU or Total Module + pub powerScope: nvmlPowerScopeType_t, + ///!< [out] Power value to retrieve or set in milliwatts + pub powerValueMw: ::core::ffi::c_uint, +} impl nvmlGpuVirtualizationMode { ///!< Represents Bare Metal GPU pub const NVML_GPU_VIRTUALIZATION_MODE_NONE: nvmlGpuVirtualizationMode = nvmlGpuVirtualizationMode( @@ -2076,10 +2625,16 @@ impl nvmlVgpuDriverCapability_enum { ); } impl nvmlVgpuDriverCapability_enum { - pub const NVML_VGPU_DRIVER_CAP_COUNT: nvmlVgpuDriverCapability_enum = nvmlVgpuDriverCapability_enum( + ///!< Supports FSR and warm update of vGPU host driver without terminating the running guest VM + pub const NVML_VGPU_DRIVER_CAP_WARM_UPDATE: nvmlVgpuDriverCapability_enum = nvmlVgpuDriverCapability_enum( 1, ); } +impl nvmlVgpuDriverCapability_enum { + pub const NVML_VGPU_DRIVER_CAP_COUNT: nvmlVgpuDriverCapability_enum = nvmlVgpuDriverCapability_enum( + 2, + ); +} #[repr(transparent)] /// vGPU driver queryable capabilities #[derive(Debug, 
Copy, Clone, Hash, PartialEq, Eq)] @@ -2087,19 +2642,19 @@ pub struct nvmlVgpuDriverCapability_enum(pub ::core::ffi::c_uint); /// vGPU driver queryable capabilities pub use self::nvmlVgpuDriverCapability_enum as nvmlVgpuDriverCapability_t; impl nvmlDeviceVgpuCapability_enum { - ///!< Query if the fractional vGPU profiles on this GPU can be used in multi-vGPU configurations + ///!< Query whether the fractional vGPU profiles on this GPU can be used in multi-vGPU configurations pub const NVML_DEVICE_VGPU_CAP_FRACTIONAL_MULTI_VGPU: nvmlDeviceVgpuCapability_enum = nvmlDeviceVgpuCapability_enum( 0, ); } impl nvmlDeviceVgpuCapability_enum { - ///!< Query if the GPU support concurrent execution of timesliced vGPU profiles of differing types + ///!< Query whether the GPU supports concurrent execution of timesliced vGPU profiles of differing types pub const NVML_DEVICE_VGPU_CAP_HETEROGENEOUS_TIMESLICE_PROFILES: nvmlDeviceVgpuCapability_enum = nvmlDeviceVgpuCapability_enum( 1, ); } impl nvmlDeviceVgpuCapability_enum { - ///!< Query if the GPU support concurrent execution of timesliced vGPU profiles of differing framebuffer sizes + ///!< Query whether the GPU supports concurrent execution of timesliced vGPU profiles of differing framebuffer sizes pub const NVML_DEVICE_VGPU_CAP_HETEROGENEOUS_TIMESLICE_SIZES: nvmlDeviceVgpuCapability_enum = nvmlDeviceVgpuCapability_enum( 2, ); @@ -2117,7 +2672,7 @@ impl nvmlDeviceVgpuCapability_enum { ); } impl nvmlDeviceVgpuCapability_enum { - ///!< Query if vGPU profiles on the GPU supports migration data streaming + ///!< Query whether the vGPU profiles on the GPU support migration data streaming pub const NVML_DEVICE_VGPU_CAP_DEVICE_STREAMING: nvmlDeviceVgpuCapability_enum = nvmlDeviceVgpuCapability_enum( 5, ); @@ -2135,10 +2690,22 @@ impl nvmlDeviceVgpuCapability_enum { ); } impl nvmlDeviceVgpuCapability_enum { - pub const NVML_DEVICE_VGPU_CAP_COUNT: nvmlDeviceVgpuCapability_enum = nvmlDeviceVgpuCapability_enum( + ///!< Query whether the GPU supports FSR and warm update + pub const NVML_DEVICE_VGPU_CAP_WARM_UPDATE: nvmlDeviceVgpuCapability_enum = nvmlDeviceVgpuCapability_enum( 8, ); } +impl nvmlDeviceVgpuCapability_enum { + ///!< Query whether the GPU supports reporting of placements of timesliced vGPU profiles with identical framebuffer sizes + pub const NVML_DEVICE_VGPU_CAP_HOMOGENEOUS_PLACEMENTS: nvmlDeviceVgpuCapability_enum = nvmlDeviceVgpuCapability_enum( + 9, + ); +} +impl nvmlDeviceVgpuCapability_enum { + pub const NVML_DEVICE_VGPU_CAP_COUNT: nvmlDeviceVgpuCapability_enum = nvmlDeviceVgpuCapability_enum( + 10, + ); +} #[repr(transparent)] /// Device vGPU queryable capabilities #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] @@ -2183,8 +2750,34 @@ pub struct nvmlVgpuPlacementList_v1_t { ///!< Placement IDs for the vGPU type pub placementIds: *mut ::core::ffi::c_uint, } -/// Structure to store the list of vGPU placements -- version 1 -pub type nvmlVgpuPlacementList_t = nvmlVgpuPlacementList_v1_t; +/// Structure to store the list of vGPU placements -- version 2 +#[repr(C)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct nvmlVgpuPlacementList_v2_t { + ///!< IN: The version number of this struct + pub version: ::core::ffi::c_uint, + ///!< OUT: The number of slots occupied by the vGPU type + pub placementSize: ::core::ffi::c_uint, + ///!< IN/OUT: Count of the placement IDs + pub count: ::core::ffi::c_uint, + ///!< IN/OUT: Placement IDs for the vGPU type + pub placementIds: *mut ::core::ffi::c_uint, + ///!< IN: The vGPU mode. 
Either NVML_VGPU_PGPU_HETEROGENEOUS_MODE or NVML_VGPU_PGPU_HOMOGENEOUS_MODE + pub mode: ::core::ffi::c_uint, +} +/// Structure to store the list of vGPU placements -- version 2 +pub type nvmlVgpuPlacementList_t = nvmlVgpuPlacementList_v2_t; +/// Structure to store BAR1 size information of vGPU type -- Version 1 +#[repr(C)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct nvmlVgpuTypeBar1Info_v1_t { + ///!< The version number of this struct + pub version: ::core::ffi::c_uint, + ///!< BAR1 size in megabytes + pub bar1Size: ::core::ffi::c_ulonglong, +} +/// Structure to store BAR1 size information of vGPU type -- Version 1 +pub type nvmlVgpuTypeBar1Info_t = nvmlVgpuTypeBar1Info_v1_t; /// Structure to store Utilization Value and vgpuInstance #[repr(C)] #[derive(Copy, Clone)] @@ -2305,6 +2898,17 @@ pub struct nvmlVgpuProcessesUtilizationInfo_v1_t { } /// Structure to store recent utilization, vgpuInstance and subprocess information for processes running on vGPU instances active on a device -- version 1 pub type nvmlVgpuProcessesUtilizationInfo_t = nvmlVgpuProcessesUtilizationInfo_v1_t; +/// Structure to store the information of vGPU runtime state -- version 1 +#[repr(C)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct nvmlVgpuRuntimeState_v1_t { + ///!< IN: The version number of this struct + pub version: ::core::ffi::c_uint, + ///!< OUT: The runtime state size of the vGPU instance + pub size: ::core::ffi::c_ulonglong, +} +/// Structure to store the information of vGPU runtime state -- version 1 +pub type nvmlVgpuRuntimeState_t = nvmlVgpuRuntimeState_v1_t; /// Union to represent the vGPU Scheduler Parameters #[repr(C)] #[derive(Copy, Clone)] @@ -2462,61 +3066,6 @@ pub struct nvmlVgpuLicenseInfo_st { pub currentState: ::core::ffi::c_uint, } pub type nvmlVgpuLicenseInfo_t = nvmlVgpuLicenseInfo_st; -/// Structure to store utilization value and process Id -#[repr(C)] -#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] -pub struct nvmlProcessUtilizationSample_st { - ///!< PID of process - pub pid: ::core::ffi::c_uint, - ///!< CPU Timestamp in microseconds - pub timeStamp: ::core::ffi::c_ulonglong, - ///!< SM (3D/Compute) Util Value - pub smUtil: ::core::ffi::c_uint, - ///!< Frame Buffer Memory Util Value - pub memUtil: ::core::ffi::c_uint, - ///!< Encoder Util Value - pub encUtil: ::core::ffi::c_uint, - ///!< Decoder Util Value - pub decUtil: ::core::ffi::c_uint, -} -/// Structure to store utilization value and process Id -pub type nvmlProcessUtilizationSample_t = nvmlProcessUtilizationSample_st; -/// Structure to store utilization value and process Id -- version 1 -#[repr(C)] -#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] -pub struct nvmlProcessUtilizationInfo_v1_t { - ///!< CPU Timestamp in microseconds - pub timeStamp: ::core::ffi::c_ulonglong, - ///!< PID of process - pub pid: ::core::ffi::c_uint, - ///!< SM (3D/Compute) Util Value - pub smUtil: ::core::ffi::c_uint, - ///!< Frame Buffer Memory Util Value - pub memUtil: ::core::ffi::c_uint, - ///!< Encoder Util Value - pub encUtil: ::core::ffi::c_uint, - ///!< Decoder Util Value - pub decUtil: ::core::ffi::c_uint, - ///!< Jpeg Util Value - pub jpgUtil: ::core::ffi::c_uint, - ///!< Ofa Util Value - pub ofaUtil: ::core::ffi::c_uint, -} -/// Structure to store utilization and process ID for each running process -- version 1 -#[repr(C)] -#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] -pub struct nvmlProcessesUtilizationInfo_v1_t { - ///!< The version number of this struct - pub version: ::core::ffi::c_uint, - 
///!< Caller-supplied array size, and returns number of processes running - pub processSamplesCount: ::core::ffi::c_uint, - ///!< Return only samples with timestamp greater than lastSeenTimeStamp - pub lastSeenTimeStamp: ::core::ffi::c_ulonglong, - ///!< The array (allocated by caller) of the utilization of GPU SM, framebuffer, video encoder, video decoder, JPEG, and OFA - pub procUtilArray: *mut nvmlProcessUtilizationInfo_v1_t, -} -/// Structure to store utilization and process ID for each running process -- version 1 -pub type nvmlProcessesUtilizationInfo_t = nvmlProcessesUtilizationInfo_v1_t; /// Structure to store license expiry date and time values #[repr(C)] #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] @@ -2570,90 +3119,44 @@ pub struct nvmlGridLicensableFeatures_st { } /// Structure to store vGPU software licensable features pub type nvmlGridLicensableFeatures_t = nvmlGridLicensableFeatures_st; -/// Structure to store SRAM uncorrectable error counters -#[repr(C)] -#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] -pub struct nvmlEccSramErrorStatus_v1_t { - ///!< the API version number - pub version: ::core::ffi::c_uint, - ///!< aggregate uncorrectable parity error count - pub aggregateUncParity: ::core::ffi::c_ulonglong, - ///!< aggregate uncorrectable SEC-DED error count - pub aggregateUncSecDed: ::core::ffi::c_ulonglong, - ///!< aggregate correctable error count - pub aggregateCor: ::core::ffi::c_ulonglong, - ///!< volatile uncorrectable parity error count - pub volatileUncParity: ::core::ffi::c_ulonglong, - ///!< volatile uncorrectable SEC-DED error count - pub volatileUncSecDed: ::core::ffi::c_ulonglong, - ///!< volatile correctable error count - pub volatileCor: ::core::ffi::c_ulonglong, - ///!< aggregate uncorrectable error count for L2 cache bucket - pub aggregateUncBucketL2: ::core::ffi::c_ulonglong, - ///!< aggregate uncorrectable error count for SM bucket - pub aggregateUncBucketSm: ::core::ffi::c_ulonglong, - ///!< aggregate uncorrectable error count for PCIE bucket - pub aggregateUncBucketPcie: ::core::ffi::c_ulonglong, - ///!< aggregate uncorrectable error count for Microcontroller bucket - pub aggregateUncBucketMcu: ::core::ffi::c_ulonglong, - ///!< aggregate uncorrectable error count for Other bucket - pub aggregateUncBucketOther: ::core::ffi::c_ulonglong, - ///!< if the error threshold of field diag is exceeded - pub bThresholdExceeded: ::core::ffi::c_uint, -} -/// Structure to store SRAM uncorrectable error counters -pub type nvmlEccSramErrorStatus_t = nvmlEccSramErrorStatus_v1_t; -pub type nvmlDeviceArchitecture_t = ::core::ffi::c_uint; -pub type nvmlBusType_t = ::core::ffi::c_uint; -pub type nvmlFanControlPolicy_t = ::core::ffi::c_uint; -pub type nvmlPowerSource_t = ::core::ffi::c_uint; -impl nvmlGpuUtilizationDomainId_t { - ///!< Graphics engine domain - pub const NVML_GPU_UTILIZATION_DOMAIN_GPU: nvmlGpuUtilizationDomainId_t = nvmlGpuUtilizationDomainId_t( +impl nvmlDeviceGpuRecoveryAction_s { + pub const NVML_GPU_RECOVERY_ACTION_NONE: nvmlDeviceGpuRecoveryAction_s = nvmlDeviceGpuRecoveryAction_s( 0, ); } -impl nvmlGpuUtilizationDomainId_t { - ///!< Frame buffer domain - pub const NVML_GPU_UTILIZATION_DOMAIN_FB: nvmlGpuUtilizationDomainId_t = nvmlGpuUtilizationDomainId_t( +impl nvmlDeviceGpuRecoveryAction_s { + pub const NVML_GPU_RECOVERY_ACTION_GPU_RESET: nvmlDeviceGpuRecoveryAction_s = nvmlDeviceGpuRecoveryAction_s( 1, ); } -impl nvmlGpuUtilizationDomainId_t { - ///!< Video engine domain - pub const NVML_GPU_UTILIZATION_DOMAIN_VID: 
nvmlGpuUtilizationDomainId_t = nvmlGpuUtilizationDomainId_t( +impl nvmlDeviceGpuRecoveryAction_s { + pub const NVML_GPU_RECOVERY_ACTION_NODE_REBOOT: nvmlDeviceGpuRecoveryAction_s = nvmlDeviceGpuRecoveryAction_s( 2, ); } -impl nvmlGpuUtilizationDomainId_t { - ///!< Bus interface domain - pub const NVML_GPU_UTILIZATION_DOMAIN_BUS: nvmlGpuUtilizationDomainId_t = nvmlGpuUtilizationDomainId_t( +impl nvmlDeviceGpuRecoveryAction_s { + pub const NVML_GPU_RECOVERY_ACTION_DRAIN_P2P: nvmlDeviceGpuRecoveryAction_s = nvmlDeviceGpuRecoveryAction_s( 3, ); } +impl nvmlDeviceGpuRecoveryAction_s { + pub const NVML_GPU_RECOVERY_ACTION_DRAIN_AND_RESET: nvmlDeviceGpuRecoveryAction_s = nvmlDeviceGpuRecoveryAction_s( + 4, + ); +} #[repr(transparent)] +/// Enum describing the GPU Recovery Action #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] -pub struct nvmlGpuUtilizationDomainId_t(pub ::core::ffi::c_uint); +pub struct nvmlDeviceGpuRecoveryAction_s(pub ::core::ffi::c_uint); +/// Enum describing the GPU Recovery Action +pub use self::nvmlDeviceGpuRecoveryAction_s as nvmlDeviceGpuRecoveryAction_t; #[repr(C)] #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] -pub struct nvmlGpuDynamicPstatesInfo_st { - ///!< Reserved for future use - pub flags: ::core::ffi::c_uint, - pub utilization: [nvmlGpuDynamicPstatesInfo_st__bindgen_ty_1; 8usize], +pub struct nvmlNvLinkPowerThres_st { + ///!< Low power threshold + pub lowPwrThreshold: ::core::ffi::c_uint, } -#[repr(C)] -#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] -pub struct nvmlGpuDynamicPstatesInfo_st__bindgen_ty_1 { - ///!< Set if this utilization domain is present on this GPU - pub bIsPresent: ::core::ffi::c_uint, - ///!< Percentage of time where the domain is considered busy in the last 1-second interval - pub percentage: ::core::ffi::c_uint, - ///!< Utilization threshold that can trigger a perf-increasing P-State change when crossed - pub incThreshold: ::core::ffi::c_uint, - ///!< Utilization threshold that can trigger a perf-decreasing P-State change when crossed - pub decThreshold: ::core::ffi::c_uint, -} -pub type nvmlGpuDynamicPstatesInfo_t = nvmlGpuDynamicPstatesInfo_st; +pub type nvmlNvLinkPowerThres_t = nvmlNvLinkPowerThres_st; /// Information for a Field Value Sample #[repr(C)] #[derive(Copy, Clone)] @@ -2821,7 +3324,7 @@ pub struct nvmlEventData_st { pub device: nvmlDevice_t, ///!< Information about what specific event occurred pub eventType: ::core::ffi::c_ulonglong, - ///!< Stores XID error for the device in the event of nvmlEventTypeXidCriticalError, + ///!< Stores Xid error for the device in the event of nvmlEventTypeXidCriticalError, pub eventData: ::core::ffi::c_ulonglong, ///!< If MIG is enabled and nvmlEventTypeXidCriticalError event is attributable to a GPU pub gpuInstanceId: ::core::ffi::c_uint, @@ -3086,6 +3589,7 @@ pub struct nvmlConfComputeGetKeyRotationThresholdInfo_st { pub type nvmlConfComputeGetKeyRotationThresholdInfo_v1_t = nvmlConfComputeGetKeyRotationThresholdInfo_st; pub type nvmlConfComputeGetKeyRotationThresholdInfo_t = nvmlConfComputeGetKeyRotationThresholdInfo_v1_t; pub type nvmlGpuFabricState_t = ::core::ffi::c_uchar; +/// Contains the device fabric information #[repr(C)] #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] pub struct nvmlGpuFabricInfo_t { @@ -3107,7 +3611,7 @@ pub struct nvmlGpuFabricInfo_t { #[repr(C)] #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] pub struct nvmlGpuFabricInfo_v2_t { - ///!< Structure version identifier (set to \ref nvmlGpuFabricInfo_v2) + ///!< Structure version identifier (set to 
nvmlGpuFabricInfo_v2) pub version: ::core::ffi::c_uint, ///!< Uuid of the cluster to which this GPU belongs pub clusterUuid: [::core::ffi::c_uchar; 16usize], @@ -3127,18 +3631,28 @@ pub struct nvmlGpuFabricInfo_v2_t { field to the end. This structure is not backwards-compatible with \ref nvmlGpuFabricInfo_t.*/ pub type nvmlGpuFabricInfoV_t = nvmlGpuFabricInfo_v2_t; -pub type nvmlPowerScopeType_t = ::core::ffi::c_uchar; +/// Structure to store Driver branch information #[repr(C)] #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] -pub struct nvmlPowerValue_v2_t { - ///!< Structure format version (must be 1) +pub struct nvmlSystemDriverBranchInfo_v1_t { + ///!< The version number of this struct pub version: ::core::ffi::c_uint, - ///!< [in] Device type: GPU or Total Module - pub powerScope: nvmlPowerScopeType_t, - ///!< [out] Power value to retrieve or set in milliwatts - pub powerValueMw: ::core::ffi::c_uint, + ///!< driver branch + pub branch: [::core::ffi::c_char; 80usize], } +/// Structure to store Driver branch information +pub type nvmlSystemDriverBranchInfo_t = nvmlSystemDriverBranchInfo_v1_t; pub type nvmlAffinityScope_t = ::core::ffi::c_uint; +/// Structure used to encapsulate temperature info +#[repr(C)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct nvmlTemperature_v1_t { + pub version: ::core::ffi::c_uint, + pub sensorType: nvmlTemperatureSensors_t, + pub temperature: ::core::ffi::c_int, +} +/// Structure used to encapsulate temperature info +pub type nvmlTemperature_t = nvmlTemperature_v1_t; impl nvmlClockLimitId_enum { pub const NVML_CLOCK_LIMIT_ID_RANGE_START: nvmlClockLimitId_enum = nvmlClockLimitId_enum( 4294967040, @@ -3158,6 +3672,30 @@ impl nvmlClockLimitId_enum { #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] pub struct nvmlClockLimitId_enum(pub ::core::ffi::c_uint); pub use self::nvmlClockLimitId_enum as nvmlClockLimitId_t; +#[repr(C)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct nvmlNvlinkSupportedBwModes_v1_t { + pub version: ::core::ffi::c_uint, + pub bwModes: [::core::ffi::c_uchar; 23usize], + pub totalBwModes: ::core::ffi::c_uchar, +} +pub type nvmlNvlinkSupportedBwModes_t = nvmlNvlinkSupportedBwModes_v1_t; +#[repr(C)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct nvmlNvlinkGetBwMode_v1_t { + pub version: ::core::ffi::c_uint, + pub bIsBest: ::core::ffi::c_uint, + pub bwMode: ::core::ffi::c_uchar, +} +pub type nvmlNvlinkGetBwMode_t = nvmlNvlinkGetBwMode_v1_t; +#[repr(C)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct nvmlNvlinkSetBwMode_v1_t { + pub version: ::core::ffi::c_uint, + pub bSetBest: ::core::ffi::c_uint, + pub bwMode: ::core::ffi::c_uchar, +} +pub type nvmlNvlinkSetBwMode_t = nvmlNvlinkSetBwMode_v1_t; /// Structure representing range of vGPU versions. #[repr(C)] #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] @@ -3313,10 +3851,6 @@ pub struct nvmlExcludedDeviceInfo_st { } /// Excluded GPU device information pub type nvmlExcludedDeviceInfo_t = nvmlExcludedDeviceInfo_st; -/** MIG compute instance profile capability. - - Bit field values representing MIG profile capabilities - \ref nvmlComputeInstanceProfileInfo_v3_t.capabilities*/ #[repr(C)] #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] pub struct nvmlGpuInstancePlacement_st { @@ -3325,10 +3859,6 @@ pub struct nvmlGpuInstancePlacement_st { ///!< Number of memory slices occupied pub size: ::core::ffi::c_uint, } -/** MIG compute instance profile capability. 
- - Bit field values representing MIG profile capabilities - \ref nvmlComputeInstanceProfileInfo_v3_t.capabilities*/ pub type nvmlGpuInstancePlacement_t = nvmlGpuInstancePlacement_st; /// GPU instance profile information. #[repr(C)] @@ -3715,6 +4245,10 @@ impl nvmlGpmMetricId_t { ///!< Percent utilization of NVOFA 0. 0.0 - 100.0 pub const NVML_GPM_METRIC_NVOFA_0_UTIL: nvmlGpmMetricId_t = nvmlGpmMetricId_t(50); } +impl nvmlGpmMetricId_t { + ///!< Percent utilization of NVOFA 1. 0.0 - 100.0 + pub const NVML_GPM_METRIC_NVOFA_1_UTIL: nvmlGpmMetricId_t = nvmlGpmMetricId_t(51); +} impl nvmlGpmMetricId_t { ///!< NvLink read bandwidth for all links in MiB/sec pub const NVML_GPM_METRIC_NVLINK_TOTAL_RX_PER_SEC: nvmlGpmMetricId_t = nvmlGpmMetricId_t( @@ -3962,7 +4496,7 @@ pub type nvmlGpmSample_t = *mut nvmlGpmSample_st; #[repr(C)] #[derive(Debug, Copy, Clone, PartialEq)] pub struct nvmlGpmMetric_t { - ///!< IN: NVML_GPM_METRIC_? #define of which metric to retrieve + ///!< IN: NVML_GPM_METRIC_? define of which metric to retrieve pub metricId: ::core::ffi::c_uint, ///!< OUT: Status of this metric. If this is nonzero, then value is not valid pub nvmlReturn: nvmlReturn_t, @@ -4002,13 +4536,175 @@ pub struct nvmlGpmSupport_t { ///!< OUT: Indicates device support pub isSupportedDevice: ::core::ffi::c_uint, } +/// Device capabilities #[repr(C)] #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] -pub struct nvmlNvLinkPowerThres_st { - ///!< Low power threshold (in units of 100us) - pub lowPwrThreshold: ::core::ffi::c_uint, +pub struct nvmlDeviceCapabilities_v1_t { + ///!< the API version number + pub version: ::core::ffi::c_uint, + ///!< OUT: Bit mask of capabilities. + pub capMask: ::core::ffi::c_uint, } -pub type nvmlNvLinkPowerThres_t = nvmlNvLinkPowerThres_st; +/// Device capabilities +pub type nvmlDeviceCapabilities_t = nvmlDeviceCapabilities_v1_t; +#[repr(C)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct nvmlMask255_t { + pub mask: [::core::ffi::c_uint; 8usize], +} +impl nvmlPowerProfileType_t { + pub const NVML_POWER_PROFILE_MAX_P: nvmlPowerProfileType_t = nvmlPowerProfileType_t( + 0, + ); +} +impl nvmlPowerProfileType_t { + pub const NVML_POWER_PROFILE_MAX_Q: nvmlPowerProfileType_t = nvmlPowerProfileType_t( + 1, + ); +} +impl nvmlPowerProfileType_t { + pub const NVML_POWER_PROFILE_COMPUTE: nvmlPowerProfileType_t = nvmlPowerProfileType_t( + 2, + ); +} +impl nvmlPowerProfileType_t { + pub const NVML_POWER_PROFILE_MEMORY_BOUND: nvmlPowerProfileType_t = nvmlPowerProfileType_t( + 3, + ); +} +impl nvmlPowerProfileType_t { + pub const NVML_POWER_PROFILE_NETWORK: nvmlPowerProfileType_t = nvmlPowerProfileType_t( + 4, + ); +} +impl nvmlPowerProfileType_t { + pub const NVML_POWER_PROFILE_BALANCED: nvmlPowerProfileType_t = nvmlPowerProfileType_t( + 5, + ); +} +impl nvmlPowerProfileType_t { + pub const NVML_POWER_PROFILE_LLM_INFERENCE: nvmlPowerProfileType_t = nvmlPowerProfileType_t( + 6, + ); +} +impl nvmlPowerProfileType_t { + pub const NVML_POWER_PROFILE_LLM_TRAINING: nvmlPowerProfileType_t = nvmlPowerProfileType_t( + 7, + ); +} +impl nvmlPowerProfileType_t { + pub const NVML_POWER_PROFILE_RBM: nvmlPowerProfileType_t = nvmlPowerProfileType_t(8); +} +impl nvmlPowerProfileType_t { + pub const NVML_POWER_PROFILE_DCPCIE: nvmlPowerProfileType_t = nvmlPowerProfileType_t( + 9, + ); +} +impl nvmlPowerProfileType_t { + pub const NVML_POWER_PROFILE_HMMA_SPARSE: nvmlPowerProfileType_t = nvmlPowerProfileType_t( + 10, + ); +} +impl nvmlPowerProfileType_t { + pub const NVML_POWER_PROFILE_HMMA_DENSE: 
nvmlPowerProfileType_t = nvmlPowerProfileType_t( + 11, + ); +} +impl nvmlPowerProfileType_t { + pub const NVML_POWER_PROFILE_SYNC_BALANCED: nvmlPowerProfileType_t = nvmlPowerProfileType_t( + 12, + ); +} +impl nvmlPowerProfileType_t { + pub const NVML_POWER_PROFILE_HPC: nvmlPowerProfileType_t = nvmlPowerProfileType_t( + 13, + ); +} +impl nvmlPowerProfileType_t { + pub const NVML_POWER_PROFILE_MIG: nvmlPowerProfileType_t = nvmlPowerProfileType_t( + 14, + ); +} +impl nvmlPowerProfileType_t { + pub const NVML_POWER_PROFILE_MAX: nvmlPowerProfileType_t = nvmlPowerProfileType_t( + 15, + ); +} +#[repr(transparent)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct nvmlPowerProfileType_t(pub ::core::ffi::c_uint); +/// Profile Metadata +#[repr(C)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct nvmlWorkloadPowerProfileInfo_v1_t { + ///!< the API version number + pub version: ::core::ffi::c_uint, + pub profileId: ::core::ffi::c_uint, + pub priority: ::core::ffi::c_uint, + pub conflictingMask: nvmlMask255_t, +} +/// Profile Metadata +pub type nvmlWorkloadPowerProfileInfo_t = nvmlWorkloadPowerProfileInfo_v1_t; +/// Profiles Info +#[repr(C)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct nvmlWorkloadPowerProfileProfilesInfo_v1_t { + ///!< the API version number + pub version: ::core::ffi::c_uint, + ///!< Mask bit set to true for each valid performance profile + pub perfProfilesMask: nvmlMask255_t, + ///!< Array of performance profile info parameters + pub perfProfile: [nvmlWorkloadPowerProfileInfo_t; 255usize], +} +/// Profiles Info +pub type nvmlWorkloadPowerProfileProfilesInfo_t = nvmlWorkloadPowerProfileProfilesInfo_v1_t; +/// Current Profiles +#[repr(C)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct nvmlWorkloadPowerProfileCurrentProfiles_v1_t { + pub version: ::core::ffi::c_uint, + ///!< Mask bit set to true for each valid performance profile + pub perfProfilesMask: nvmlMask255_t, + ///!< Mask of currently requested performance profiles + pub requestedProfilesMask: nvmlMask255_t, + ///!< Mask of currently enforced performance profiles post all arbitrations among the requested profiles. 
+ pub enforcedProfilesMask: nvmlMask255_t, +} +/// Current Profiles +pub type nvmlWorkloadPowerProfileCurrentProfiles_t = nvmlWorkloadPowerProfileCurrentProfiles_v1_t; +/// Requested Profiles +#[repr(C)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct nvmlWorkloadPowerProfileRequestedProfiles_v1_t { + ///!< the API version number + pub version: ::core::ffi::c_uint, + ///!< Mask of 255 bits, each bit representing index of respective perf profile + pub requestedProfilesMask: nvmlMask255_t, +} +/// Requested Profiles +pub type nvmlWorkloadPowerProfileRequestedProfiles_t = nvmlWorkloadPowerProfileRequestedProfiles_v1_t; +#[repr(C)] +#[derive(Debug, Copy, Clone, PartialEq)] +pub struct nvmlPowerSmoothingProfile_v1_t { + ///!< the API version number + pub version: ::core::ffi::c_uint, + ///!< The requested profile ID + pub profileId: ::core::ffi::c_uint, + ///!< The requested parameter ID + pub paramId: ::core::ffi::c_uint, + ///!< The requested value for the given parameter + pub value: f64, +} +pub type nvmlPowerSmoothingProfile_t = nvmlPowerSmoothingProfile_v1_t; +#[repr(C)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct nvmlPowerSmoothingState_v1_t { + ///!< the API version number + pub version: ::core::ffi::c_uint, + ///!< 0/Disabled or 1/Enabled + pub state: nvmlEnableState_t, +} +pub type nvmlPowerSmoothingState_t = nvmlPowerSmoothingState_v1_t; impl nvmlError_t { pub const UNINITIALIZED: nvmlError_t = nvmlError_t(unsafe { ::core::num::NonZeroU32::new_unchecked(1) diff --git a/ext/hip_runtime-sys/src/lib.rs b/ext/hip_runtime-sys/src/lib.rs index 4aad7e6..9ce281e 100644 --- a/ext/hip_runtime-sys/src/lib.rs +++ b/ext/hip_runtime-sys/src/lib.rs @@ -99,12 +99,18 @@ pub const hipEventDefault: u32 = 0; pub const hipEventBlockingSync: u32 = 1; pub const hipEventDisableTiming: u32 = 2; pub const hipEventInterprocess: u32 = 4; +pub const hipEventRecordDefault: u32 = 0; +pub const hipEventRecordExternal: u32 = 1; pub const hipEventDisableSystemFence: u32 = 536870912; pub const hipEventReleaseToDevice: u32 = 1073741824; pub const hipEventReleaseToSystem: u32 = 2147483648; +pub const hipHostAllocDefault: u32 = 0; pub const hipHostMallocDefault: u32 = 0; +pub const hipHostAllocPortable: u32 = 1; pub const hipHostMallocPortable: u32 = 1; +pub const hipHostAllocMapped: u32 = 2; pub const hipHostMallocMapped: u32 = 2; +pub const hipHostAllocWriteCombined: u32 = 4; pub const hipHostMallocWriteCombined: u32 = 4; pub const hipHostMallocNumaUser: u32 = 536870912; pub const hipHostMallocCoherent: u32 = 1073741824; @@ -148,6 +154,222 @@ pub const hipExternalMemoryDedicated: u32 = 1; pub const hipGraphKernelNodePortDefault: u32 = 0; pub const hipGraphKernelNodePortLaunchCompletion: u32 = 2; pub const hipGraphKernelNodePortProgrammatic: u32 = 1; +impl hipJitOption { + /**< CUDA Only Maximum registers may be used in a thread, +< passed to compiler*/ + pub const hipJitOptionMaxRegisters: hipJitOption = hipJitOption(0); +} +impl hipJitOption { + ///< CUDA Only Number of threads per block + pub const hipJitOptionThreadsPerBlock: hipJitOption = hipJitOption(1); +} +impl hipJitOption { + ///< CUDA Only Value for total wall clock time + pub const hipJitOptionWallTime: hipJitOption = hipJitOption(2); +} +impl hipJitOption { + ///< CUDA Only Pointer to the buffer with logged information + pub const hipJitOptionInfoLogBuffer: hipJitOption = hipJitOption(3); +} +impl hipJitOption { + ///< CUDA Only Size of the buffer in bytes for logged info + pub const hipJitOptionInfoLogBufferSizeBytes: 
hipJitOption = hipJitOption(4); +} +impl hipJitOption { + ///< CUDA Only Pointer to the buffer with logged error(s) + pub const hipJitOptionErrorLogBuffer: hipJitOption = hipJitOption(5); +} +impl hipJitOption { + ///< CUDA Only Size of the buffer in bytes for logged error(s) + pub const hipJitOptionErrorLogBufferSizeBytes: hipJitOption = hipJitOption(6); +} +impl hipJitOption { + /**< Value of optimization level for generated codes, acceptable options +< -O0, -O1, -O2, -O3*/ + pub const hipJitOptionOptimizationLevel: hipJitOption = hipJitOption(7); +} +impl hipJitOption { + ///< CUDA Only The target context, which is the default + pub const hipJitOptionTargetFromContext: hipJitOption = hipJitOption(8); +} +impl hipJitOption { + ///< CUDA Only JIT target + pub const hipJitOptionTarget: hipJitOption = hipJitOption(9); +} +impl hipJitOption { + ///< CUDA Only Fallback strategy + pub const hipJitOptionFallbackStrategy: hipJitOption = hipJitOption(10); +} +impl hipJitOption { + ///< CUDA Only Generate debug information + pub const hipJitOptionGenerateDebugInfo: hipJitOption = hipJitOption(11); +} +impl hipJitOption { + ///< CUDA Only Generate log verbose + pub const hipJitOptionLogVerbose: hipJitOption = hipJitOption(12); +} +impl hipJitOption { + ///< CUDA Only Generate line number information + pub const hipJitOptionGenerateLineInfo: hipJitOption = hipJitOption(13); +} +impl hipJitOption { + ///< CUDA Only Set cache mode + pub const hipJitOptionCacheMode: hipJitOption = hipJitOption(14); +} +impl hipJitOption { + ///< @deprecated CUDA Only New SM3X option. + pub const hipJitOptionSm3xOpt: hipJitOption = hipJitOption(15); +} +impl hipJitOption { + ///< CUDA Only Set fast compile + pub const hipJitOptionFastCompile: hipJitOption = hipJitOption(16); +} +impl hipJitOption { + ///< CUDA Only Array of device symbol names to be relocated to the host + pub const hipJitOptionGlobalSymbolNames: hipJitOption = hipJitOption(17); +} +impl hipJitOption { + ///< CUDA Only Array of host addresses to be relocated to the device + pub const hipJitOptionGlobalSymbolAddresses: hipJitOption = hipJitOption(18); +} +impl hipJitOption { + ///< CUDA Only Number of symbol count. + pub const hipJitOptionGlobalSymbolCount: hipJitOption = hipJitOption(19); +} +impl hipJitOption { + ///< @deprecated CUDA Only Enable link-time optimization for device code + pub const hipJitOptionLto: hipJitOption = hipJitOption(20); +} +impl hipJitOption { + ///< @deprecated CUDA Only Set single-precision denormals. 
+ pub const hipJitOptionFtz: hipJitOption = hipJitOption(21); +} +impl hipJitOption { + /**< @deprecated CUDA Only Set single-precision floating-point division +< and reciprocals*/ + pub const hipJitOptionPrecDiv: hipJitOption = hipJitOption(22); +} +impl hipJitOption { + ///< @deprecated CUDA Only Set single-precision floating-point square root + pub const hipJitOptionPrecSqrt: hipJitOption = hipJitOption(23); +} +impl hipJitOption { + /**< @deprecated CUDA Only Enable floating-point multiplies and +< adds/subtracts operations*/ + pub const hipJitOptionFma: hipJitOption = hipJitOption(24); +} +impl hipJitOption { + ///< CUDA Only Generates Position Independent code + pub const hipJitOptionPositionIndependentCode: hipJitOption = hipJitOption(25); +} +impl hipJitOption { + /**< CUDA Only Hints to JIT compiler the minimum number of CTAs from kernel's +< grid to be mapped to SM*/ + pub const hipJitOptionMinCTAPerSM: hipJitOption = hipJitOption(26); +} +impl hipJitOption { + ///< CUDA only Maximum number of threads in a thread block + pub const hipJitOptionMaxThreadsPerBlock: hipJitOption = hipJitOption(27); +} +impl hipJitOption { + ///< Cuda only Override Directive values + pub const hipJitOptionOverrideDirectiveValues: hipJitOption = hipJitOption(28); +} +impl hipJitOption { + ///< Number of options + pub const hipJitOptionNumOptions: hipJitOption = hipJitOption(29); +} +impl hipJitOption { + ///< Hip Only Linker options to be passed on to compiler + pub const hipJitOptionIRtoISAOptExt: hipJitOption = hipJitOption(10000); +} +impl hipJitOption { + ///< Hip Only Count of linker options to be passed on to compiler + pub const hipJitOptionIRtoISAOptCountExt: hipJitOption = hipJitOption(10001); +} +#[repr(transparent)] +/// hipJitOption +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct hipJitOption(pub ::core::ffi::c_uint); +impl hipJitInputType { + ///< Cuda only Input cubin + pub const hipJitInputCubin: hipJitInputType = hipJitInputType(0); +} +impl hipJitInputType { + ///< Cuda only Input PTX + pub const hipJitInputPtx: hipJitInputType = hipJitInputType(1); +} +impl hipJitInputType { + ///< Cuda Only Input FAT Binary + pub const hipJitInputFatBinary: hipJitInputType = hipJitInputType(2); +} +impl hipJitInputType { + ///< Cuda Only Host Object with embedded device code + pub const hipJitInputObject: hipJitInputType = hipJitInputType(3); +} +impl hipJitInputType { + /**< Cuda Only Archive of Host Objects with embedded +< device code*/ + pub const hipJitInputLibrary: hipJitInputType = hipJitInputType(4); +} +impl hipJitInputType { + /**< @deprecated Cuda only High Level intermediate +< code for LTO*/ + pub const hipJitInputNvvm: hipJitInputType = hipJitInputType(5); +} +impl hipJitInputType { + ///< Count of Legacy Input Types + pub const hipJitNumLegacyInputTypes: hipJitInputType = hipJitInputType(6); +} +impl hipJitInputType { + ///< HIP Only LLVM Bitcode or IR assembly + pub const hipJitInputLLVMBitcode: hipJitInputType = hipJitInputType(100); +} +impl hipJitInputType { + ///< HIP Only LLVM Clang Bundled Code + pub const hipJitInputLLVMBundledBitcode: hipJitInputType = hipJitInputType(101); +} +impl hipJitInputType { + ///< HIP Only LLVM Archive of Bundled Bitcode + pub const hipJitInputLLVMArchivesOfBundledBitcode: hipJitInputType = hipJitInputType( + 102, + ); +} +impl hipJitInputType { + ///< HIP Only SPIRV Code Object + pub const hipJitInputSpirv: hipJitInputType = hipJitInputType(103); +} +impl hipJitInputType { + ///< Count of Input Types + pub const hipJitNumInputTypes: 
hipJitInputType = hipJitInputType(10); +} +#[repr(transparent)] +/// hipJitInputType +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct hipJitInputType(pub ::core::ffi::c_uint); +impl hipJitCacheMode { + pub const hipJitCacheOptionNone: hipJitCacheMode = hipJitCacheMode(0); +} +impl hipJitCacheMode { + pub const hipJitCacheOptionCG: hipJitCacheMode = hipJitCacheMode(1); +} +impl hipJitCacheMode { + pub const hipJitCacheOptionCA: hipJitCacheMode = hipJitCacheMode(2); +} +#[repr(transparent)] +/// hipJitCacheMode +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct hipJitCacheMode(pub ::core::ffi::c_uint); +impl hipJitFallback { + pub const hipJitPreferPTX: hipJitFallback = hipJitFallback(0); +} +impl hipJitFallback { + pub const hipJitPreferBinary: hipJitFallback = hipJitFallback(1); +} +#[repr(transparent)] +/// hipJitFallback +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct hipJitFallback(pub ::core::ffi::c_uint); #[doc = " @defgroup GlobalDefs Global enum and defines\n @{\n\n/\n/**\n hipDeviceArch_t\n"] #[repr(C)] #[repr(align(4))] @@ -1605,26 +1827,36 @@ impl hipGPUDirectRDMAWritesOrdering { #[repr(transparent)] #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] pub struct hipGPUDirectRDMAWritesOrdering(pub ::core::ffi::c_uint); +/** @defgroup DriverTypes Driver Types + @{ + This section describes the driver data types. +*/ #[repr(transparent)] #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] pub struct hipDeviceptr_t(pub *mut ::core::ffi::c_void); impl hipChannelFormatKind { + ///< Signed channel format pub const hipChannelFormatKindSigned: hipChannelFormatKind = hipChannelFormatKind(0); } impl hipChannelFormatKind { + ///< Unsigned channel format pub const hipChannelFormatKindUnsigned: hipChannelFormatKind = hipChannelFormatKind( 1, ); } impl hipChannelFormatKind { + ///< Float channel format pub const hipChannelFormatKindFloat: hipChannelFormatKind = hipChannelFormatKind(2); } impl hipChannelFormatKind { + ///< No channel format pub const hipChannelFormatKindNone: hipChannelFormatKind = hipChannelFormatKind(3); } #[repr(transparent)] +/// HIP channel format kinds #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] pub struct hipChannelFormatKind(pub ::core::ffi::c_uint); +/// HIP channel format descriptor #[repr(C)] #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] pub struct hipChannelFormatDesc { @@ -1632,6 +1864,7 @@ pub struct hipChannelFormatDesc { pub y: ::core::ffi::c_int, pub z: ::core::ffi::c_int, pub w: ::core::ffi::c_int, + ///< Channel format kind pub f: hipChannelFormatKind, } #[repr(C)] @@ -1642,106 +1875,161 @@ pub struct hipArray { pub type hipArray_t = *mut hipArray; pub type hipArray_const_t = *const hipArray; impl hipArray_Format { + ///< Unsigned 8-bit array format pub const HIP_AD_FORMAT_UNSIGNED_INT8: hipArray_Format = hipArray_Format(1); } impl hipArray_Format { + ///< Unsigned 16-bit array format pub const HIP_AD_FORMAT_UNSIGNED_INT16: hipArray_Format = hipArray_Format(2); } impl hipArray_Format { + ///< Unsigned 32-bit array format pub const HIP_AD_FORMAT_UNSIGNED_INT32: hipArray_Format = hipArray_Format(3); } impl hipArray_Format { + ///< Signed 8-bit array format pub const HIP_AD_FORMAT_SIGNED_INT8: hipArray_Format = hipArray_Format(8); } impl hipArray_Format { + ///< Signed 16-bit array format pub const HIP_AD_FORMAT_SIGNED_INT16: hipArray_Format = hipArray_Format(9); } impl hipArray_Format { + ///< Signed 32-bit array format pub const HIP_AD_FORMAT_SIGNED_INT32: hipArray_Format = hipArray_Format(10); } impl 
hipArray_Format { + ///< Half array format pub const HIP_AD_FORMAT_HALF: hipArray_Format = hipArray_Format(16); } impl hipArray_Format { + ///< Float array format pub const HIP_AD_FORMAT_FLOAT: hipArray_Format = hipArray_Format(32); } #[repr(transparent)] +/// HIP array format #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] pub struct hipArray_Format(pub ::core::ffi::c_uint); +/// HIP array descriptor #[repr(C)] #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] pub struct HIP_ARRAY_DESCRIPTOR { + ///< Width of the array pub Width: usize, + ///< Height of the array pub Height: usize, + ///< Format of the array pub Format: hipArray_Format, + ///< Number of channels of the array pub NumChannels: ::core::ffi::c_uint, } +/// HIP 3D array descriptor #[repr(C)] #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] pub struct HIP_ARRAY3D_DESCRIPTOR { + ///< Width of the array pub Width: usize, + ///< Height of the array pub Height: usize, + ///< Depth of the array pub Depth: usize, + ///< Format of the array pub Format: hipArray_Format, + ///< Number of channels of the array pub NumChannels: ::core::ffi::c_uint, + ///< Flags of the array pub Flags: ::core::ffi::c_uint, } +/// HIP 2D memory copy parameters #[repr(C)] #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] pub struct hip_Memcpy2D { + ///< Source width in bytes pub srcXInBytes: usize, + ///< Source height pub srcY: usize, + ///< Source memory type pub srcMemoryType: hipMemoryType, + ///< Source pointer pub srcHost: *const ::core::ffi::c_void, + ///< Source device pub srcDevice: hipDeviceptr_t, + ///< Source array pub srcArray: hipArray_t, + ///< Source pitch pub srcPitch: usize, + ///< Destination width in bytes pub dstXInBytes: usize, + ///< Destination height pub dstY: usize, + ///< Destination memory type pub dstMemoryType: hipMemoryType, + ///< Destination pointer pub dstHost: *mut ::core::ffi::c_void, + ///< Destination device pub dstDevice: hipDeviceptr_t, + ///< Destination array pub dstArray: hipArray_t, + ///< Destination pitch pub dstPitch: usize, + ///< Width in bytes of the 2D memory copy pub WidthInBytes: usize, + ///< Height of the 2D memory copy pub Height: usize, } +/// HIP mipmapped array #[repr(C)] #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] pub struct hipMipmappedArray { + ///< Data pointer of the mipmapped array pub data: *mut ::core::ffi::c_void, + ///< Description of the mipmapped array pub desc: hipChannelFormatDesc, + ///< Type of the mipmapped array pub type_: ::core::ffi::c_uint, + ///< Width of the mipmapped array pub width: ::core::ffi::c_uint, + ///< Height of the mipmapped array pub height: ::core::ffi::c_uint, + ///< Depth of the mipmapped array pub depth: ::core::ffi::c_uint, + ///< Minimum level of the mipmapped array pub min_mipmap_level: ::core::ffi::c_uint, + ///< Maximum level of the mipmapped array pub max_mipmap_level: ::core::ffi::c_uint, + ///< Flags of the mipmapped array pub flags: ::core::ffi::c_uint, + ///< Format of the mipmapped array pub format: hipArray_Format, + ///< Number of channels of the mipmapped array pub num_channels: ::core::ffi::c_uint, } +/// HIP mipmapped array pointer pub type hipMipmappedArray_t = *mut hipMipmappedArray; pub type hipmipmappedArray = hipMipmappedArray_t; pub type hipMipmappedArray_const_t = *const hipMipmappedArray; impl hipResourceType { + ///< Array resource pub const hipResourceTypeArray: hipResourceType = hipResourceType(0); } impl hipResourceType { + ///< Mipmapped array resource pub const hipResourceTypeMipmappedArray: hipResourceType = 
hipResourceType(1); } impl hipResourceType { + ///< Linear resource pub const hipResourceTypeLinear: hipResourceType = hipResourceType(2); } impl hipResourceType { + ///< Pitch 2D resource pub const hipResourceTypePitch2D: hipResourceType = hipResourceType(3); } #[repr(transparent)] -/// hip resource types +/// HIP resource types #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] pub struct hipResourceType(pub ::core::ffi::c_uint); impl HIPresourcetype_enum { - ///< Array resoure + ///< Array resource pub const HIP_RESOURCE_TYPE_ARRAY: HIPresourcetype_enum = HIPresourcetype_enum(0); } impl HIPresourcetype_enum { @@ -1764,36 +2052,42 @@ pub struct HIPresourcetype_enum(pub ::core::ffi::c_uint); pub use self::HIPresourcetype_enum as HIPresourcetype; pub use self::HIPresourcetype_enum as hipResourcetype; impl HIPaddress_mode_enum { + ///< Wrap address mode pub const HIP_TR_ADDRESS_MODE_WRAP: HIPaddress_mode_enum = HIPaddress_mode_enum(0); } impl HIPaddress_mode_enum { + ///< Clamp address mode pub const HIP_TR_ADDRESS_MODE_CLAMP: HIPaddress_mode_enum = HIPaddress_mode_enum(1); } impl HIPaddress_mode_enum { + ///< Mirror address mode pub const HIP_TR_ADDRESS_MODE_MIRROR: HIPaddress_mode_enum = HIPaddress_mode_enum(2); } impl HIPaddress_mode_enum { + ///< Border address mode pub const HIP_TR_ADDRESS_MODE_BORDER: HIPaddress_mode_enum = HIPaddress_mode_enum(3); } #[repr(transparent)] -/// hip address modes +/// HIP texture address modes #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] pub struct HIPaddress_mode_enum(pub ::core::ffi::c_uint); -/// hip address modes +/// HIP texture address modes pub use self::HIPaddress_mode_enum as HIPaddress_mode; impl HIPfilter_mode_enum { + ///< Filter mode point pub const HIP_TR_FILTER_MODE_POINT: HIPfilter_mode_enum = HIPfilter_mode_enum(0); } impl HIPfilter_mode_enum { + ///< Filter mode linear pub const HIP_TR_FILTER_MODE_LINEAR: HIPfilter_mode_enum = HIPfilter_mode_enum(1); } #[repr(transparent)] -/// hip filter modes +/// HIP filter modes #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] pub struct HIPfilter_mode_enum(pub ::core::ffi::c_uint); -/// hip filter modes +/// HIP filter modes pub use self::HIPfilter_mode_enum as HIPfilter_mode; -/// Texture descriptor +/// HIP texture descriptor #[repr(C)] #[derive(Debug, Copy, Clone, PartialEq)] pub struct HIP_TEXTURE_DESC_st { @@ -1817,171 +2111,206 @@ pub struct HIP_TEXTURE_DESC_st { pub borderColor: [f32; 4usize], pub reserved: [::core::ffi::c_int; 12usize], } -/// Texture descriptor +/// HIP texture descriptor pub type HIP_TEXTURE_DESC = HIP_TEXTURE_DESC_st; impl hipResourceViewFormat { + ///< No resource view format (use underlying resource format) pub const hipResViewFormatNone: hipResourceViewFormat = hipResourceViewFormat(0); } impl hipResourceViewFormat { + ///< 1 channel, unsigned 8-bit integers pub const hipResViewFormatUnsignedChar1: hipResourceViewFormat = hipResourceViewFormat( 1, ); } impl hipResourceViewFormat { + ///< 2 channels, unsigned 8-bit integers pub const hipResViewFormatUnsignedChar2: hipResourceViewFormat = hipResourceViewFormat( 2, ); } impl hipResourceViewFormat { + ///< 4 channels, unsigned 8-bit integers pub const hipResViewFormatUnsignedChar4: hipResourceViewFormat = hipResourceViewFormat( 3, ); } impl hipResourceViewFormat { + ///< 1 channel, signed 8-bit integers pub const hipResViewFormatSignedChar1: hipResourceViewFormat = hipResourceViewFormat( 4, ); } impl hipResourceViewFormat { + ///< 2 channels, signed 8-bit integers pub const hipResViewFormatSignedChar2: 
hipResourceViewFormat = hipResourceViewFormat( 5, ); } impl hipResourceViewFormat { + ///< 4 channels, signed 8-bit integers pub const hipResViewFormatSignedChar4: hipResourceViewFormat = hipResourceViewFormat( 6, ); } impl hipResourceViewFormat { + ///< 1 channel, unsigned 16-bit integers pub const hipResViewFormatUnsignedShort1: hipResourceViewFormat = hipResourceViewFormat( 7, ); } impl hipResourceViewFormat { + ///< 2 channels, unsigned 16-bit integers pub const hipResViewFormatUnsignedShort2: hipResourceViewFormat = hipResourceViewFormat( 8, ); } impl hipResourceViewFormat { + ///< 4 channels, unsigned 16-bit integers pub const hipResViewFormatUnsignedShort4: hipResourceViewFormat = hipResourceViewFormat( 9, ); } impl hipResourceViewFormat { + ///< 1 channel, signed 16-bit integers pub const hipResViewFormatSignedShort1: hipResourceViewFormat = hipResourceViewFormat( 10, ); } impl hipResourceViewFormat { + ///< 2 channels, signed 16-bit integers pub const hipResViewFormatSignedShort2: hipResourceViewFormat = hipResourceViewFormat( 11, ); } impl hipResourceViewFormat { + ///< 4 channels, signed 16-bit integers pub const hipResViewFormatSignedShort4: hipResourceViewFormat = hipResourceViewFormat( 12, ); } impl hipResourceViewFormat { + ///< 1 channel, unsigned 32-bit integers pub const hipResViewFormatUnsignedInt1: hipResourceViewFormat = hipResourceViewFormat( 13, ); } impl hipResourceViewFormat { + ///< 2 channels, unsigned 32-bit integers pub const hipResViewFormatUnsignedInt2: hipResourceViewFormat = hipResourceViewFormat( 14, ); } impl hipResourceViewFormat { + ///< 4 channels, unsigned 32-bit integers pub const hipResViewFormatUnsignedInt4: hipResourceViewFormat = hipResourceViewFormat( 15, ); } impl hipResourceViewFormat { + ///< 1 channel, signed 32-bit integers pub const hipResViewFormatSignedInt1: hipResourceViewFormat = hipResourceViewFormat( 16, ); } impl hipResourceViewFormat { + ///< 2 channels, signed 32-bit integers pub const hipResViewFormatSignedInt2: hipResourceViewFormat = hipResourceViewFormat( 17, ); } impl hipResourceViewFormat { + ///< 4 channels, signed 32-bit integers pub const hipResViewFormatSignedInt4: hipResourceViewFormat = hipResourceViewFormat( 18, ); } impl hipResourceViewFormat { + ///< 1 channel, 16-bit floating point pub const hipResViewFormatHalf1: hipResourceViewFormat = hipResourceViewFormat(19); } impl hipResourceViewFormat { + ///< 2 channels, 16-bit floating point pub const hipResViewFormatHalf2: hipResourceViewFormat = hipResourceViewFormat(20); } impl hipResourceViewFormat { + ///< 4 channels, 16-bit floating point pub const hipResViewFormatHalf4: hipResourceViewFormat = hipResourceViewFormat(21); } impl hipResourceViewFormat { + ///< 1 channel, 32-bit floating point pub const hipResViewFormatFloat1: hipResourceViewFormat = hipResourceViewFormat(22); } impl hipResourceViewFormat { + ///< 2 channels, 32-bit floating point pub const hipResViewFormatFloat2: hipResourceViewFormat = hipResourceViewFormat(23); } impl hipResourceViewFormat { + ///< 4 channels, 32-bit floating point pub const hipResViewFormatFloat4: hipResourceViewFormat = hipResourceViewFormat(24); } impl hipResourceViewFormat { + ///< Block-compressed 1 pub const hipResViewFormatUnsignedBlockCompressed1: hipResourceViewFormat = hipResourceViewFormat( 25, ); } impl hipResourceViewFormat { + ///< Block-compressed 2 pub const hipResViewFormatUnsignedBlockCompressed2: hipResourceViewFormat = hipResourceViewFormat( 26, ); } impl hipResourceViewFormat { + ///< Block-compressed 3 pub 
const hipResViewFormatUnsignedBlockCompressed3: hipResourceViewFormat = hipResourceViewFormat( 27, ); } impl hipResourceViewFormat { + ///< Block-compressed 4 unsigned pub const hipResViewFormatUnsignedBlockCompressed4: hipResourceViewFormat = hipResourceViewFormat( 28, ); } impl hipResourceViewFormat { + ///< Block-compressed 4 signed pub const hipResViewFormatSignedBlockCompressed4: hipResourceViewFormat = hipResourceViewFormat( 29, ); } impl hipResourceViewFormat { + ///< Block-compressed 5 unsigned pub const hipResViewFormatUnsignedBlockCompressed5: hipResourceViewFormat = hipResourceViewFormat( 30, ); } impl hipResourceViewFormat { + ///< Block-compressed 5 signed pub const hipResViewFormatSignedBlockCompressed5: hipResourceViewFormat = hipResourceViewFormat( 31, ); } impl hipResourceViewFormat { + ///< Block-compressed 6 unsigned half-float pub const hipResViewFormatUnsignedBlockCompressed6H: hipResourceViewFormat = hipResourceViewFormat( 32, ); } impl hipResourceViewFormat { + ///< Block-compressed 6 signed half-float pub const hipResViewFormatSignedBlockCompressed6H: hipResourceViewFormat = hipResourceViewFormat( 33, ); } impl hipResourceViewFormat { + ///< Block-compressed 7 pub const hipResViewFormatUnsignedBlockCompressed7: hipResourceViewFormat = hipResourceViewFormat( 34, ); } #[repr(transparent)] -/// hip texture resource view formats +/// HIP texture resource view formats #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] pub struct hipResourceViewFormat(pub ::core::ffi::c_uint); impl HIPresourceViewFormat_enum { @@ -1991,217 +2320,220 @@ impl HIPresourceViewFormat_enum { ); } impl HIPresourceViewFormat_enum { - ///< 1 channel unsigned 8-bit integers + ///< 1 channel, unsigned 8-bit integers pub const HIP_RES_VIEW_FORMAT_UINT_1X8: HIPresourceViewFormat_enum = HIPresourceViewFormat_enum( 1, ); } impl HIPresourceViewFormat_enum { - ///< 2 channel unsigned 8-bit integers + ///< 2 channels, unsigned 8-bit integers pub const HIP_RES_VIEW_FORMAT_UINT_2X8: HIPresourceViewFormat_enum = HIPresourceViewFormat_enum( 2, ); } impl HIPresourceViewFormat_enum { - ///< 4 channel unsigned 8-bit integers + ///< 4 channels, unsigned 8-bit integers pub const HIP_RES_VIEW_FORMAT_UINT_4X8: HIPresourceViewFormat_enum = HIPresourceViewFormat_enum( 3, ); } impl HIPresourceViewFormat_enum { - ///< 1 channel signed 8-bit integers + ///< 1 channel, signed 8-bit integers pub const HIP_RES_VIEW_FORMAT_SINT_1X8: HIPresourceViewFormat_enum = HIPresourceViewFormat_enum( 4, ); } impl HIPresourceViewFormat_enum { - ///< 2 channel signed 8-bit integers + ///< 2 channels, signed 8-bit integers pub const HIP_RES_VIEW_FORMAT_SINT_2X8: HIPresourceViewFormat_enum = HIPresourceViewFormat_enum( 5, ); } impl HIPresourceViewFormat_enum { - ///< 4 channel signed 8-bit integers + ///< 4 channels, signed 8-bit integers pub const HIP_RES_VIEW_FORMAT_SINT_4X8: HIPresourceViewFormat_enum = HIPresourceViewFormat_enum( 6, ); } impl HIPresourceViewFormat_enum { - ///< 1 channel unsigned 16-bit integers + ///< 1 channel, unsigned 16-bit integers pub const HIP_RES_VIEW_FORMAT_UINT_1X16: HIPresourceViewFormat_enum = HIPresourceViewFormat_enum( 7, ); } impl HIPresourceViewFormat_enum { - ///< 2 channel unsigned 16-bit integers + ///< 2 channels, unsigned 16-bit integers pub const HIP_RES_VIEW_FORMAT_UINT_2X16: HIPresourceViewFormat_enum = HIPresourceViewFormat_enum( 8, ); } impl HIPresourceViewFormat_enum { - ///< 4 channel unsigned 16-bit integers + ///< 4 channels, unsigned 16-bit integers pub const HIP_RES_VIEW_FORMAT_UINT_4X16: 
HIPresourceViewFormat_enum = HIPresourceViewFormat_enum( 9, ); } impl HIPresourceViewFormat_enum { - ///< 1 channel signed 16-bit integers + ///< 1 channel, signed 16-bit integers pub const HIP_RES_VIEW_FORMAT_SINT_1X16: HIPresourceViewFormat_enum = HIPresourceViewFormat_enum( 10, ); } impl HIPresourceViewFormat_enum { - ///< 2 channel signed 16-bit integers + ///< 2 channels, signed 16-bit integers pub const HIP_RES_VIEW_FORMAT_SINT_2X16: HIPresourceViewFormat_enum = HIPresourceViewFormat_enum( 11, ); } impl HIPresourceViewFormat_enum { - ///< 4 channel signed 16-bit integers + ///< 4 channels, signed 16-bit integers pub const HIP_RES_VIEW_FORMAT_SINT_4X16: HIPresourceViewFormat_enum = HIPresourceViewFormat_enum( 12, ); } impl HIPresourceViewFormat_enum { - ///< 1 channel unsigned 32-bit integers + ///< 1 channel, unsigned 32-bit integers pub const HIP_RES_VIEW_FORMAT_UINT_1X32: HIPresourceViewFormat_enum = HIPresourceViewFormat_enum( 13, ); } impl HIPresourceViewFormat_enum { - ///< 2 channel unsigned 32-bit integers + ///< 2 channels, unsigned 32-bit integers pub const HIP_RES_VIEW_FORMAT_UINT_2X32: HIPresourceViewFormat_enum = HIPresourceViewFormat_enum( 14, ); } impl HIPresourceViewFormat_enum { - ///< 4 channel unsigned 32-bit integers + ///< 4 channels, unsigned 32-bit integers pub const HIP_RES_VIEW_FORMAT_UINT_4X32: HIPresourceViewFormat_enum = HIPresourceViewFormat_enum( 15, ); } impl HIPresourceViewFormat_enum { - ///< 1 channel signed 32-bit integers + ///< 1 channel, signed 32-bit integers pub const HIP_RES_VIEW_FORMAT_SINT_1X32: HIPresourceViewFormat_enum = HIPresourceViewFormat_enum( 16, ); } impl HIPresourceViewFormat_enum { - ///< 2 channel signed 32-bit integers + ///< 2 channels, signed 32-bit integers pub const HIP_RES_VIEW_FORMAT_SINT_2X32: HIPresourceViewFormat_enum = HIPresourceViewFormat_enum( 17, ); } impl HIPresourceViewFormat_enum { - ///< 4 channel signed 32-bit integers + ///< 4 channels, signed 32-bit integers pub const HIP_RES_VIEW_FORMAT_SINT_4X32: HIPresourceViewFormat_enum = HIPresourceViewFormat_enum( 18, ); } impl HIPresourceViewFormat_enum { - ///< 1 channel 16-bit floating point + ///< 1 channel, 16-bit floating point pub const HIP_RES_VIEW_FORMAT_FLOAT_1X16: HIPresourceViewFormat_enum = HIPresourceViewFormat_enum( 19, ); } impl HIPresourceViewFormat_enum { - ///< 2 channel 16-bit floating point + ///< 2 channels, 16-bit floating point pub const HIP_RES_VIEW_FORMAT_FLOAT_2X16: HIPresourceViewFormat_enum = HIPresourceViewFormat_enum( 20, ); } impl HIPresourceViewFormat_enum { - ///< 4 channel 16-bit floating point + ///< 4 channels, 16-bit floating point pub const HIP_RES_VIEW_FORMAT_FLOAT_4X16: HIPresourceViewFormat_enum = HIPresourceViewFormat_enum( 21, ); } impl HIPresourceViewFormat_enum { - ///< 1 channel 32-bit floating point + ///< 1 channel, 32-bit floating point pub const HIP_RES_VIEW_FORMAT_FLOAT_1X32: HIPresourceViewFormat_enum = HIPresourceViewFormat_enum( 22, ); } impl HIPresourceViewFormat_enum { - ///< 2 channel 32-bit floating point + ///< 2 channels, 32-bit floating point pub const HIP_RES_VIEW_FORMAT_FLOAT_2X32: HIPresourceViewFormat_enum = HIPresourceViewFormat_enum( 23, ); } impl HIPresourceViewFormat_enum { - ///< 4 channel 32-bit floating point + ///< 4 channels, 32-bit floating point pub const HIP_RES_VIEW_FORMAT_FLOAT_4X32: HIPresourceViewFormat_enum = HIPresourceViewFormat_enum( 24, ); } impl HIPresourceViewFormat_enum { - ///< Block compressed 1 + ///< Block-compressed 1 pub const HIP_RES_VIEW_FORMAT_UNSIGNED_BC1: 
HIPresourceViewFormat_enum = HIPresourceViewFormat_enum( 25, ); } impl HIPresourceViewFormat_enum { - ///< Block compressed 2 + ///< Block-compressed 2 pub const HIP_RES_VIEW_FORMAT_UNSIGNED_BC2: HIPresourceViewFormat_enum = HIPresourceViewFormat_enum( 26, ); } impl HIPresourceViewFormat_enum { - ///< Block compressed 3 + ///< Block-compressed 3 pub const HIP_RES_VIEW_FORMAT_UNSIGNED_BC3: HIPresourceViewFormat_enum = HIPresourceViewFormat_enum( 27, ); } impl HIPresourceViewFormat_enum { - ///< Block compressed 4 unsigned + ///< Block-compressed 4 unsigned pub const HIP_RES_VIEW_FORMAT_UNSIGNED_BC4: HIPresourceViewFormat_enum = HIPresourceViewFormat_enum( 28, ); } impl HIPresourceViewFormat_enum { - ///< Block compressed 4 signed + ///< Block-compressed 4 signed pub const HIP_RES_VIEW_FORMAT_SIGNED_BC4: HIPresourceViewFormat_enum = HIPresourceViewFormat_enum( 29, ); } impl HIPresourceViewFormat_enum { - ///< Block compressed 5 unsigned + ///< Block-compressed 5 unsigned pub const HIP_RES_VIEW_FORMAT_UNSIGNED_BC5: HIPresourceViewFormat_enum = HIPresourceViewFormat_enum( 30, ); } impl HIPresourceViewFormat_enum { - ///< Block compressed 5 signed + ///< Block-compressed 5 signed pub const HIP_RES_VIEW_FORMAT_SIGNED_BC5: HIPresourceViewFormat_enum = HIPresourceViewFormat_enum( 31, ); } impl HIPresourceViewFormat_enum { - ///< Block compressed 6 unsigned half-float + ///< Block-compressed 6 unsigned half-float pub const HIP_RES_VIEW_FORMAT_UNSIGNED_BC6H: HIPresourceViewFormat_enum = HIPresourceViewFormat_enum( 32, ); } impl HIPresourceViewFormat_enum { - ///< Block compressed 6 signed half-float + ///< Block-compressed 6 signed half-float pub const HIP_RES_VIEW_FORMAT_SIGNED_BC6H: HIPresourceViewFormat_enum = HIPresourceViewFormat_enum( 33, ); } impl HIPresourceViewFormat_enum { - ///< Block compressed 7 + ///< Block-compressed 7 pub const HIP_RES_VIEW_FORMAT_UNSIGNED_BC7: HIPresourceViewFormat_enum = HIPresourceViewFormat_enum( 34, ); } #[repr(transparent)] +/// HIP texture resource view formats #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] pub struct HIPresourceViewFormat_enum(pub ::core::ffi::c_uint); +/// HIP texture resource view formats pub use self::HIPresourceViewFormat_enum as HIPresourceViewFormat; /// HIP resource descriptor #[repr(C)] #[derive(Copy, Clone)] pub struct hipResourceDesc { + ///< Resource type pub resType: hipResourceType, pub res: hipResourceDesc__bindgen_ty_1, } @@ -2216,29 +2548,40 @@ pub union hipResourceDesc__bindgen_ty_1 { #[repr(C)] #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] pub struct hipResourceDesc__bindgen_ty_1__bindgen_ty_1 { + ///< HIP array pub array: hipArray_t, } #[repr(C)] #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] pub struct hipResourceDesc__bindgen_ty_1__bindgen_ty_2 { + ///< HIP mipmapped array pub mipmap: hipMipmappedArray_t, } #[repr(C)] #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] pub struct hipResourceDesc__bindgen_ty_1__bindgen_ty_3 { + ///< Device pointer pub devPtr: *mut ::core::ffi::c_void, + ///< Channel format description pub desc: hipChannelFormatDesc, + ///< Size in bytes pub sizeInBytes: usize, } #[repr(C)] #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] pub struct hipResourceDesc__bindgen_ty_1__bindgen_ty_4 { + ///< Device pointer pub devPtr: *mut ::core::ffi::c_void, + ///< Channel format description pub desc: hipChannelFormatDesc, + ///< Width of the array in elements pub width: usize, + ///< Height of the array in elements pub height: usize, + ///< Pitch between two rows in bytes pub pitchInBytes: usize, 
} +/// HIP resource descriptor struct #[repr(C)] #[derive(Copy, Clone)] pub struct HIP_RESOURCE_DESC_st { @@ -2302,18 +2645,27 @@ pub struct HIP_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_4 { pub struct HIP_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_5 { pub reserved: [::core::ffi::c_int; 32usize], } +/// HIP resource descriptor struct pub type HIP_RESOURCE_DESC = HIP_RESOURCE_DESC_st; -/// hip resource view descriptor +/// HIP resource view descriptor #[repr(C)] #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] pub struct hipResourceViewDesc { + ///< Resource view format pub format: hipResourceViewFormat, + ///< Width of the resource view pub width: usize, + ///< Height of the resource view pub height: usize, + ///< Depth of the resource view pub depth: usize, + ///< First defined mipmap level pub firstMipmapLevel: ::core::ffi::c_uint, + ///< Last defined mipmap level pub lastMipmapLevel: ::core::ffi::c_uint, + ///< First layer index pub firstLayer: ::core::ffi::c_uint, + ///< Last layer index pub lastLayer: ::core::ffi::c_uint, } /// Resource view descriptor @@ -2368,14 +2720,20 @@ impl hipMemcpyKind { #[repr(transparent)] #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] pub struct hipMemcpyKind(pub ::core::ffi::c_uint); +/// HIP pitched pointer #[repr(C)] #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] pub struct hipPitchedPtr { + ///< Pointer to the allocated memory pub ptr: *mut ::core::ffi::c_void, + ///< Pitch in bytes pub pitch: usize, + ///< Logical size of the first dimension of allocation in elements pub xsize: usize, + ///< Logical size of the second dimension of allocation in elements pub ysize: usize, } +/// HIP extent #[repr(C)] #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] pub struct hipExtent { @@ -2383,98 +2741,145 @@ pub struct hipExtent {
dstArray: hipArray_t, + ///< Destination pitch pub dstPitch: usize, + ///< Destination height pub dstHeight: usize, + ///< Width in bytes of 3D memory copy pub WidthInBytes: usize, + ///< Height in bytes of 3D memory copy pub Height: usize, + ///< Depth in bytes of 3D memory copy pub Depth: usize, } impl hipFunction_attribute { + ///< The maximum number of threads per block. Depends on function and device. pub const HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK: hipFunction_attribute = hipFunction_attribute( 0, ); } impl hipFunction_attribute { + ///< The statically allocated shared memory size in bytes per block required by the function. pub const HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES: hipFunction_attribute = hipFunction_attribute( 1, ); } impl hipFunction_attribute { + ///< The user-allocated constant memory by the function in bytes. pub const HIP_FUNC_ATTRIBUTE_CONST_SIZE_BYTES: hipFunction_attribute = hipFunction_attribute( 2, ); } impl hipFunction_attribute { + ///< The local memory usage of each thread by this function in bytes. pub const HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES: hipFunction_attribute = hipFunction_attribute( 3, ); } impl hipFunction_attribute { + ///< The number of registers used by each thread of this function. pub const HIP_FUNC_ATTRIBUTE_NUM_REGS: hipFunction_attribute = hipFunction_attribute( 4, ); } impl hipFunction_attribute { + ///< PTX version pub const HIP_FUNC_ATTRIBUTE_PTX_VERSION: hipFunction_attribute = hipFunction_attribute( 5, ); } impl hipFunction_attribute { + ///< Binary version pub const HIP_FUNC_ATTRIBUTE_BINARY_VERSION: hipFunction_attribute = hipFunction_attribute( 6, ); } impl hipFunction_attribute { + ///< Cache mode pub const HIP_FUNC_ATTRIBUTE_CACHE_MODE_CA: hipFunction_attribute = hipFunction_attribute( 7, ); } impl hipFunction_attribute { + ///< The maximum dynamic shared memory per block for this function in bytes. pub const HIP_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: hipFunction_attribute = hipFunction_attribute( 8, ); } impl hipFunction_attribute { + ///< The shared memory carveout preference in percent of the maximum shared memory. 
pub const HIP_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: hipFunction_attribute = hipFunction_attribute( 9, ); @@ -2487,32 +2892,32 @@ impl hipFunction_attribute { pub struct hipFunction_attribute(pub ::core::ffi::c_uint); impl hipPointer_attribute { /**< The context on which a pointer was allocated -< @warning - not supported in HIP*/ +< @warning This attribute is not supported in HIP*/ pub const HIP_POINTER_ATTRIBUTE_CONTEXT: hipPointer_attribute = hipPointer_attribute( 1, ); } impl hipPointer_attribute { - ///< memory type describing location of a pointer + ///< memory type describing the location of a pointer pub const HIP_POINTER_ATTRIBUTE_MEMORY_TYPE: hipPointer_attribute = hipPointer_attribute( 2, ); } impl hipPointer_attribute { - ///< address at which the pointer is allocated on device + ///< address at which the pointer is allocated on the device pub const HIP_POINTER_ATTRIBUTE_DEVICE_POINTER: hipPointer_attribute = hipPointer_attribute( 3, ); } impl hipPointer_attribute { - ///< address at which the pointer is allocated on host + ///< address at which the pointer is allocated on the host pub const HIP_POINTER_ATTRIBUTE_HOST_POINTER: hipPointer_attribute = hipPointer_attribute( 4, ); } impl hipPointer_attribute { - /**< A pair of tokens for use with linux kernel interface -< @warning - not supported in HIP*/ + /**< A pair of tokens for use with Linux kernel interface +< @warning This attribute is not supported in HIP*/ pub const HIP_POINTER_ATTRIBUTE_P2P_TOKENS: hipPointer_attribute = hipPointer_attribute( 5, ); @@ -2546,7 +2951,7 @@ impl hipPointer_attribute { impl hipPointer_attribute { /**< if this pointer maps to an allocation < that is suitable for hipIpcGetMemHandle -< @warning - not supported in HIP*/ +< @warning This attribute is not supported in HIP*/ pub const HIP_POINTER_ATTRIBUTE_IS_LEGACY_HIP_IPC_CAPABLE: hipPointer_attribute = hipPointer_attribute( 10, ); @@ -2572,7 +2977,7 @@ impl hipPointer_attribute { } impl hipPointer_attribute { /**< Bitmask of allowed hipmemAllocationHandleType -< for this allocation @warning - not supported in HIP*/ +< for this allocation @warning This attribute is not supported in HIP*/ pub const HIP_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES: hipPointer_attribute = hipPointer_attribute( 14, ); @@ -2580,7 +2985,7 @@ impl hipPointer_attribute { impl hipPointer_attribute { /**< returns if the memory referenced by < this pointer can be used with the GPUDirect RDMA API -< @warning - not supported in HIP*/ +< @warning This attribute is not supported in HIP*/ pub const HIP_POINTER_ATTRIBUTE_IS_GPU_DIRECT_RDMA_CAPABLE: hipPointer_attribute = hipPointer_attribute( 15, ); @@ -2595,7 +3000,7 @@ impl hipPointer_attribute { impl hipPointer_attribute { /**< Returns the mempool handle for the allocation if < it was allocated from a mempool -< @warning - not supported in HIP*/ +< @warning This attribute is not supported in HIP*/ pub const HIP_POINTER_ATTRIBUTE_MEMPOOL_HANDLE: hipPointer_attribute = hipPointer_attribute( 17, ); @@ -2772,6 +3177,12 @@ pub struct ihipModuleSymbol_t { pub struct hipFunction_t(pub *mut ihipModuleSymbol_t); #[repr(C)] #[derive(Debug, Copy, Clone)] +pub struct ihipLinkState_t { + _unused: [u8; 0], +} +pub type hipLinkState_t = *mut ihipLinkState_t; +#[repr(C)] +#[derive(Debug, Copy, Clone)] pub struct ihipMemPoolHandle_t { _unused: [u8; 0], } @@ -2824,6 +3235,197 @@ impl hipLimit_t { enum is treated as "UnsupportedLimit" by default.*/ #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] pub struct hipLimit_t(pub 
::core::ffi::c_uint); +impl hipStreamBatchMemOpType { + pub const hipStreamMemOpWaitValue32: hipStreamBatchMemOpType = hipStreamBatchMemOpType( + 1, + ); +} +impl hipStreamBatchMemOpType { + pub const hipStreamMemOpWriteValue32: hipStreamBatchMemOpType = hipStreamBatchMemOpType( + 2, + ); +} +impl hipStreamBatchMemOpType { + pub const hipStreamMemOpWaitValue64: hipStreamBatchMemOpType = hipStreamBatchMemOpType( + 4, + ); +} +impl hipStreamBatchMemOpType { + pub const hipStreamMemOpWriteValue64: hipStreamBatchMemOpType = hipStreamBatchMemOpType( + 5, + ); +} +impl hipStreamBatchMemOpType { + ///< Currently not supported + pub const hipStreamMemOpBarrier: hipStreamBatchMemOpType = hipStreamBatchMemOpType( + 6, + ); +} +impl hipStreamBatchMemOpType { + ///< Currently not supported + pub const hipStreamMemOpFlushRemoteWrites: hipStreamBatchMemOpType = hipStreamBatchMemOpType( + 3, + ); +} +#[repr(transparent)] +/// Operations for hipStreamBatchMemOp +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct hipStreamBatchMemOpType(pub ::core::ffi::c_uint); +/** @brief Union representing batch memory operation parameters for HIP streams. + + hipStreamBatchMemOpParams is used to specify the parameters for batch memory + operations in a HIP stream. This union supports various operations including + waiting for a specific value, writing a value, and different flags for wait conditions. + + @details + The union includes fields for different types of operations defined in the + enum hipStreamBatchMemOpType: + - hipStreamMemOpWaitValue32: Wait for a 32-bit value. + - hipStreamMemOpWriteValue32: Write a 32-bit value. + - hipStreamMemOpWaitValue64: Wait for a 64-bit value. + - hipStreamMemOpWriteValue64: Write a 64-bit value. + + Each operation type includes an address, the value to wait for or write, flags, and an + optional alias that is not relevant on AMD GPUs. Flags can be used to specify different + wait conditions such as equality, bitwise AND, greater than or equal, and bitwise NOR. + + Example usage: + @code + hipStreamBatchMemOpParams myArray[2]; + myArray[0].operation = hipStreamMemOpWaitValue32; + myArray[0].waitValue.address = waitAddr1; + myArray[0].waitValue.value = 0x1; + myArray[0].waitValue.flags = CU_STREAM_WAIT_VALUE_EQ; + + myArray[1].operation = hipStreamMemOpWriteValue32; + myArray[1].writeValue.address = writeAddr1; + myArray[1].writeValue.value = 0x1; + myArray[1].writeValue.flags = 0x0; + + result = hipStreamBatchMemOp(stream, 2, myArray, 0); + @endcode*/ +#[repr(C)] +#[derive(Copy, Clone)] +pub union hipStreamBatchMemOpParams_union { + pub operation: hipStreamBatchMemOpType, + pub waitValue: hipStreamBatchMemOpParams_union_hipStreamMemOpWaitValueParams_t, + pub writeValue: hipStreamBatchMemOpParams_union_hipStreamMemOpWriteValueParams_t, + ///< Currently not supported on AMD + pub flushRemoteWrites: hipStreamBatchMemOpParams_union_hipStreamMemOpFlushRemoteWritesParams_t, + ///< Currently not supported on AMD + pub memoryBarrier: hipStreamBatchMemOpParams_union_hipStreamMemOpMemoryBarrierParams_t, + pub pad: [u64; 6usize], +} +#[repr(C)] +#[derive(Copy, Clone)] +pub struct hipStreamBatchMemOpParams_union_hipStreamMemOpWaitValueParams_t { + pub operation: hipStreamBatchMemOpType, + pub address: hipDeviceptr_t, + pub __bindgen_anon_1: hipStreamBatchMemOpParams_union_hipStreamMemOpWaitValueParams_t__bindgen_ty_1, + pub flags: ::core::ffi::c_uint, + ///< Not valid for AMD backend. 
Initial value is unimportant + pub alias: hipDeviceptr_t, +} +#[repr(C)] +#[derive(Copy, Clone)] +pub union hipStreamBatchMemOpParams_union_hipStreamMemOpWaitValueParams_t__bindgen_ty_1 { + pub value: u32, + pub value64: u64, +} +#[repr(C)] +#[derive(Copy, Clone)] +pub struct hipStreamBatchMemOpParams_union_hipStreamMemOpWriteValueParams_t { + pub operation: hipStreamBatchMemOpType, + pub address: hipDeviceptr_t, + pub __bindgen_anon_1: hipStreamBatchMemOpParams_union_hipStreamMemOpWriteValueParams_t__bindgen_ty_1, + pub flags: ::core::ffi::c_uint, + ///< Not valid for AMD backend. Initial value is unimportant + pub alias: hipDeviceptr_t, +} +#[repr(C)] +#[derive(Copy, Clone)] +pub union hipStreamBatchMemOpParams_union_hipStreamMemOpWriteValueParams_t__bindgen_ty_1 { + pub value: u32, + pub value64: u64, +} +#[repr(C)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct hipStreamBatchMemOpParams_union_hipStreamMemOpFlushRemoteWritesParams_t { + pub operation: hipStreamBatchMemOpType, + pub flags: ::core::ffi::c_uint, +} +#[repr(C)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct hipStreamBatchMemOpParams_union_hipStreamMemOpMemoryBarrierParams_t { + pub operation: hipStreamBatchMemOpType, + pub flags: ::core::ffi::c_uint, +} +/** @brief Union representing batch memory operation parameters for HIP streams. + + hipStreamBatchMemOpParams is used to specify the parameters for batch memory + operations in a HIP stream. This union supports various operations including + waiting for a specific value, writing a value, and different flags for wait conditions. + + @details + The union includes fields for different types of operations defined in the + enum hipStreamBatchMemOpType: + - hipStreamMemOpWaitValue32: Wait for a 32-bit value. + - hipStreamMemOpWriteValue32: Write a 32-bit value. + - hipStreamMemOpWaitValue64: Wait for a 64-bit value. + - hipStreamMemOpWriteValue64: Write a 64-bit value. + + Each operation type includes an address, the value to wait for or write, flags, and an + optional alias that is not relevant on AMD GPUs. Flags can be used to specify different + wait conditions such as equality, bitwise AND, greater than or equal, and bitwise NOR. + + Example usage: + @code + hipStreamBatchMemOpParams myArray[2]; + myArray[0].operation = hipStreamMemOpWaitValue32; + myArray[0].waitValue.address = waitAddr1; + myArray[0].waitValue.value = 0x1; + myArray[0].waitValue.flags = CU_STREAM_WAIT_VALUE_EQ; + + myArray[1].operation = hipStreamMemOpWriteValue32; + myArray[1].writeValue.address = writeAddr1; + myArray[1].writeValue.value = 0x1; + myArray[1].writeValue.flags = 0x0; + + result = hipStreamBatchMemOp(stream, 2, myArray, 0); + @endcode*/ +pub type hipStreamBatchMemOpParams = hipStreamBatchMemOpParams_union; +/** @brief Structure representing node parameters for batch memory operations in HIP graphs. + + hipBatchMemOpNodeParams is used to specify the parameters for batch memory + operations in HIP graphs. This struct includes the context to use for the operations, the + number of operations, and an array of hipStreamBatchMemOpParams that describe the operations. + + @details + The structure includes the following fields: + - ctx: The HIP context to use for the operations. + - count: The number of operations in the paramArray. + - paramArray: A pointer to an array of hipStreamBatchMemOpParams. + - flags: Flags to control the node. 
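The Rust side of these generated bindings can populate the same two-entry array as the C snippet through the per-operation structs; each struct's leading `operation` field overlaps the union's tag. A minimal sketch, not part of this patch: `wait_addr` and `write_addr` are hypothetical device pointers, and the wait-condition flag is left at 0 for brevity.

```rust
// Sketch only: build a wait-then-write pair of batch memory operations.
fn build_batch_ops(
    wait_addr: hipDeviceptr_t,
    write_addr: hipDeviceptr_t,
) -> [hipStreamBatchMemOpParams; 2] {
    let null = hipDeviceptr_t(core::ptr::null_mut());
    [
        hipStreamBatchMemOpParams_union {
            waitValue: hipStreamBatchMemOpParams_union_hipStreamMemOpWaitValueParams_t {
                operation: hipStreamBatchMemOpType::hipStreamMemOpWaitValue32,
                address: wait_addr,
                __bindgen_anon_1:
                    hipStreamBatchMemOpParams_union_hipStreamMemOpWaitValueParams_t__bindgen_ty_1 {
                        value: 0x1,
                    },
                flags: 0,    // wait condition (equality etc.); constants omitted here
                alias: null, // not used on AMD; initial value is unimportant
            },
        },
        hipStreamBatchMemOpParams_union {
            writeValue: hipStreamBatchMemOpParams_union_hipStreamMemOpWriteValueParams_t {
                operation: hipStreamBatchMemOpType::hipStreamMemOpWriteValue32,
                address: write_addr,
                __bindgen_anon_1:
                    hipStreamBatchMemOpParams_union_hipStreamMemOpWriteValueParams_t__bindgen_ty_1 {
                        value: 0x1,
                    },
                flags: 0,
                alias: null,
            },
        },
    ]
}
```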
+ + Example usage: + @code + hipBatchMemOpNodeParams nodeParams; + nodeParams.ctx = context; + nodeParams.count = ARRAY_SIZE; + nodeParams.paramArray = myArray; + nodeParams.flags = 0; + + Pass nodeParams to the HIP graph APIs hipGraphAddBatchMemOpNode, hipGraphBatchMemOpNodeGetParams, + hipGraphBatchMemOpNodeSetParams, hipGraphExecBatchMemOpNodeSetParams + @endcode*/ +#[repr(C)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub struct hipBatchMemOpNodeParams { + pub ctx: hipCtx_t, + pub count: ::core::ffi::c_uint, + pub paramArray: *mut hipStreamBatchMemOpParams, + pub flags: ::core::ffi::c_uint, +} impl hipMemoryAdvise { /**< Data will mostly be read and only occasionally < be written to*/ @@ -3114,64 +3716,6 @@ pub struct hipMemPoolProps { pub struct hipMemPoolPtrExportData { pub reserved: [::core::ffi::c_uchar; 64usize], } -impl hipJitOption { - pub const hipJitOptionMaxRegisters: hipJitOption = hipJitOption(0); -} -impl hipJitOption { - pub const hipJitOptionThreadsPerBlock: hipJitOption = hipJitOption(1); -} -impl hipJitOption { - pub const hipJitOptionWallTime: hipJitOption = hipJitOption(2); -} -impl hipJitOption { - pub const hipJitOptionInfoLogBuffer: hipJitOption = hipJitOption(3); -} -impl hipJitOption { - pub const hipJitOptionInfoLogBufferSizeBytes: hipJitOption = hipJitOption(4); -} -impl hipJitOption { - pub const hipJitOptionErrorLogBuffer: hipJitOption = hipJitOption(5); -} -impl hipJitOption { - pub const hipJitOptionErrorLogBufferSizeBytes: hipJitOption = hipJitOption(6); -} -impl hipJitOption { - pub const hipJitOptionOptimizationLevel: hipJitOption = hipJitOption(7); -} -impl hipJitOption { - pub const hipJitOptionTargetFromContext: hipJitOption = hipJitOption(8); -} -impl hipJitOption { - pub const hipJitOptionTarget: hipJitOption = hipJitOption(9); -} -impl hipJitOption { - pub const hipJitOptionFallbackStrategy: hipJitOption = hipJitOption(10); -} -impl hipJitOption { - pub const hipJitOptionGenerateDebugInfo: hipJitOption = hipJitOption(11); -} -impl hipJitOption { - pub const hipJitOptionLogVerbose: hipJitOption = hipJitOption(12); -} -impl hipJitOption { - pub const hipJitOptionGenerateLineInfo: hipJitOption = hipJitOption(13); -} -impl hipJitOption { - pub const hipJitOptionCacheMode: hipJitOption = hipJitOption(14); -} -impl hipJitOption { - pub const hipJitOptionSm3xOpt: hipJitOption = hipJitOption(15); -} -impl hipJitOption { - pub const hipJitOptionFastCompile: hipJitOption = hipJitOption(16); -} -impl hipJitOption { - pub const hipJitOptionNumOptions: hipJitOption = hipJitOption(17); -} -#[repr(transparent)] -/// hipJitOption -#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] -pub struct hipJitOption(pub ::core::ffi::c_uint); impl hipFuncAttribute { pub const hipFuncAttributeMaxDynamicSharedMemorySize: hipFuncAttribute = hipFuncAttribute( 8, @@ -3244,9 +3788,9 @@ pub struct dim3 { pub struct hipLaunchParams_t { ///< Device function symbol pub func: *mut ::core::ffi::c_void, - ///< Grid dimentions + ///< Grid dimensions pub gridDim: dim3, - ///< Block dimentions + ///< Block dimensions pub blockDim: dim3, ///< Arguments pub args: *mut *mut ::core::ffi::c_void, @@ -3638,7 +4182,11 @@ impl hipGraphNodeType { pub const hipGraphNodeTypeMemcpyToSymbol: hipGraphNodeType = hipGraphNodeType(13); } impl hipGraphNodeType { - pub const hipGraphNodeTypeCount: hipGraphNodeType = hipGraphNodeType(14); + ///< BatchMemOp node + pub const hipGraphNodeTypeBatchMemOp: hipGraphNodeType = hipGraphNodeType(14); +} +impl hipGraphNodeType { + pub const
hipGraphNodeTypeCount: hipGraphNodeType = hipGraphNodeType(15); } #[repr(transparent)] /// hipGraphNodeType @@ -3679,10 +4227,10 @@ pub struct hipMemAllocNodeParams { /**< Pool properties, which contain where < the location should reside*/ pub poolProps: hipMemPoolProps, - /**< The number of memory access descriptors. -< Must not be bigger than the number of GPUs*/ + ///< The memory access descriptors. pub accessDescs: *const hipMemAccessDesc, - ///< The number of access descriptors + /**< The number of access descriptors. +< Must not be bigger than the number of GPUs*/ pub accessDescCount: usize, ///< The size of the requested allocation in bytes pub bytesize: usize, @@ -4314,13 +4862,15 @@ extern "C" { @param [out] driverVersion driver version + HIP driver version shows up in the format: + HIP_VERSION_MAJOR * 10000000 + HIP_VERSION_MINOR * 100000 + HIP_VERSION_PATCH. + @returns #hipSuccess, #hipErrorInvalidValue - @warning The HIP feature set does not correspond to an exact CUDA SDK driver revision. - This function always set *driverVersion to 4 as an approximation though HIP supports - some features which were introduced in later CUDA SDK revisions. - HIP apps code should not rely on the driver revision number here and should - use arch feature flags to test device capabilities or conditional compilation. + @warning The HIP driver version does not correspond to an exact CUDA driver revision. + On AMD platform, the API returns the HIP driver version, while on NVIDIA platform, it calls + the corresponding CUDA runtime API and returns the CUDA driver version. + There is no mapping/correlation between HIP driver version and CUDA driver version. @see hipRuntimeGetVersion*/ pub fn hipDriverGetVersion(driverVersion: *mut ::core::ffi::c_int) -> hipError_t; @@ -4387,8 +4937,8 @@ extern "C" { @param [out] uuid UUID for the device @param [in] device device ordinal - @warning This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues. @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue, #hipErrorNotInitialized, #hipErrorDeinitialized*/ @@ -4568,8 +5118,8 @@ extern "C" { @see hipDeviceGetDefaultMemPool, hipMallocAsync, hipMemPoolTrimTo, hipMemPoolGetAttribute, hipDeviceSetMemPool, hipMemPoolSetAttribute, hipMemPoolSetAccess, hipMemPoolGetAccess - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipDeviceGetDefaultMemPool( mem_pool: *mut hipMemPool_t, device: ::core::ffi::c_int, ) -> hipError_t; @@ -4594,8 +5144,8 @@ extern "C" { @see hipDeviceGetDefaultMemPool, hipMallocAsync, hipMemPoolTrimTo, hipMemPoolGetAttribute, hipDeviceSetMemPool, hipMemPoolSetAttribute, hipMemPoolSetAccess, hipMemPoolGetAccess - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta.
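Given the MAJOR * 10000000 + MINOR * 100000 + PATCH encoding documented above for hipDriverGetVersion, the components can be recovered with integer arithmetic. A small illustrative helper, not part of this patch:

```rust
// Sketch: split a version reported by hipDriverGetVersion back into
// (major, minor, patch), per the documented packing.
fn split_hip_driver_version(version: i32) -> (i32, i32, i32) {
    let major = version / 10_000_000;
    let minor = (version / 100_000) % 100;
    let patch = version % 100_000;
    (major, minor, patch)
}
```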
While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipDeviceSetMemPool( device: ::core::ffi::c_int, mem_pool: hipMemPool_t, ) -> hipError_t; @@ -4618,8 +5168,8 @@ extern "C" { @see hipDeviceGetDefaultMemPool, hipMallocAsync, hipMemPoolTrimTo, hipMemPoolGetAttribute, hipDeviceSetMemPool, hipMemPoolSetAttribute, hipMemPoolSetAccess, hipMemPoolGetAccess - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipDeviceGetMemPool( mem_pool: *mut hipMemPool_t, device: ::core::ffi::c_int, ) -> hipError_t; @@ -4632,7 +5182,7 @@ extern "C" { @param [out] prop written with device properties @param [in] deviceId which device to query for information - @return #hipSuccess, #hipErrorInvalidDevice + @returns #hipSuccess, #hipErrorInvalidDevice @bug HCC always returns 0 for maxThreadsPerMultiProcessor @bug HCC always returns 0 for regsPerBlock @bug HCC always returns 0 for l2CacheSize @@ -4643,6 +5193,25 @@ extern "C" { deviceId: ::core::ffi::c_int, ) -> hipError_t; } +extern "C" { + #[must_use] + /** @brief Gets the maximum width for 1D linear textures on the specified device + + This function queries the maximum width, in elements, of 1D linear textures that can be allocated + on the specified device. The maximum width depends on the texture element size and the hardware + limitations of the device. + + @param [out] max_width Maximum width, in elements, of 1D linear textures that the device can support + @param [in] device Device index to query for maximum 1D texture width + + @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidDevice + + @see hipDeviceGetAttribute, hipMalloc, hipTexRefSetAddressMode*/ + pub fn hipDeviceGetTexture1DLinearMaxWidth( + max_width: *mut usize, + device: ::core::ffi::c_int, + ) -> hipError_t; +} extern "C" { #[must_use] /** @brief Set L1/Shared cache partition. @@ -4741,26 +5310,33 @@ } extern "C" { #[must_use] - /** @brief The current device behavior is changed according the flags passed. + /** @brief The current device behavior is changed according to the flags passed. @param [in] flags Flag to set on the current device The schedule flags impact how HIP waits for the completion of a command running on a device. - hipDeviceScheduleSpin : HIP runtime will actively spin in the thread which submitted the - work until the command completes. This offers the lowest latency, but will consume a CPU core - and may increase power. hipDeviceScheduleYield : The HIP runtime will yield the CPU to - system so that other tasks can use it. This may increase latency to detect the completion but - will consume less power and is friendlier to other tasks in the system. - hipDeviceScheduleBlockingSync : On ROCm platform, this is a synonym for hipDeviceScheduleYield. - hipDeviceScheduleAuto : Use a hueristic to select between Spin and Yield modes. If the - number of HIP contexts is greater than the number of logical processors in the system, use Spin - scheduling. Else use Yield scheduling. + #hipDeviceScheduleSpin : HIP runtime will actively spin in the thread which submitted + the work until the command completes. This offers the lowest latency, but will consume a CPU + core and may increase power. - hipDeviceMapHost : Allow mapping host memory. On ROCM, this is always allowed and - the flag is ignored.
hipDeviceLmemResizeToMax : @warning ROCm silently ignores this flag. + #hipDeviceScheduleYield : The HIP runtime will yield the CPU to system so that other + tasks can use it. This may increase latency to detect the completion but will consume less + power and is friendlier to other tasks in the system. - @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorSetOnActiveProcess + #hipDeviceScheduleBlockingSync : On ROCm platform, this is a synonym for hipDeviceScheduleYield. + + #hipDeviceScheduleAuto : This is the default value if the input 'flags' is zero. + Uses a heuristic to select between Spin and Yield modes. If the number of HIP contexts is + greater than the number of logical processors in the system, uses Spin scheduling, otherwise + uses Yield scheduling. + + #hipDeviceMapHost : Allows mapping host memory. On ROCm, this is always allowed and + the flag is ignored. + + #hipDeviceLmemResizeToMax : This flag is silently ignored on ROCm. + + @returns #hipSuccess, #hipErrorNoDevice, #hipErrorInvalidDevice, #hipErrorSetOnActiveProcess */ pub fn hipSetDeviceFlags(flags: ::core::ffi::c_uint) -> hipError_t; @@ -4799,22 +5375,18 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Gets an interprocess memory handle for an existing device memory - allocation + /** @brief Gets an interprocess memory handle for an existing device memory allocation. - Takes a pointer to the base of an existing device memory allocation created - with hipMalloc and exports it for use in another process. This is a - lightweight operation and may be called multiple times on an allocation - without adverse effects. + Takes a pointer to the base of an existing device memory allocation created with ::hipMalloc + and exports it for use in another process. This is a lightweight operation and may be called + multiple times on an allocation without adverse effects. - If a region of memory is freed with hipFree and a subsequent call - to hipMalloc returns memory with the same device address, - hipIpcGetMemHandle will return a unique handle for the - new memory. + If a region of memory is freed with ::hipFree and a subsequent call to ::hipMalloc returns + memory with the same device address, ::hipIpcGetMemHandle will return a unique handle for + the new memory. - @param handle - Pointer to user allocated hipIpcMemHandle to return - the handle in. - @param devPtr - Base pointer to previously allocated device memory + @param handle - Pointer to user allocated hipIpcMemHandle to return the handle in. + @param devPtr - Base pointer to previously allocated device memory. @returns #hipSuccess, #hipErrorInvalidHandle, #hipErrorOutOfMemory, #hipErrorMapFailed @@ -4827,37 +5399,34 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Opens an interprocess memory handle exported from another process - and returns a device pointer usable in the local process. + /** @brief Opens an interprocess memory handle exported from another process and returns a device + pointer usable in the local process. - Maps memory exported from another process with hipIpcGetMemHandle into - the current device address space. For contexts on different devices - hipIpcOpenMemHandle can attempt to enable peer access between the - devices as if the user called hipDeviceEnablePeerAccess. This behavior is - controlled by the hipIpcMemLazyEnablePeerAccess flag. - hipDeviceCanAccessPeer can determine if a mapping is possible. + Maps memory exported from another process with ::hipIpcGetMemHandle into the current device + address space. 
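Taken together with hipIpcGetMemHandle above, the export/import flow can be sketched against these bindings as follows. Illustrative only, not part of this patch: status checks are elided, the helper names are hypothetical, and how the handle bytes travel between the two processes (pipe, socket, file) is left to the application.

```rust
use core::ffi::c_void;
use std::mem::MaybeUninit;

// Exporting process: allocate device memory and export a handle for it.
unsafe fn ipc_export(size: usize) -> (*mut c_void, hipIpcMemHandle_t) {
    let mut dev_ptr: *mut c_void = core::ptr::null_mut();
    let _ = hipMalloc(&mut dev_ptr, size);
    let mut handle = MaybeUninit::<hipIpcMemHandle_t>::zeroed();
    let _ = hipIpcGetMemHandle(handle.as_mut_ptr(), dev_ptr);
    (dev_ptr, handle.assume_init())
}

// Importing process: map the received handle, use it, then unmap it.
unsafe fn ipc_import(handle: hipIpcMemHandle_t) -> *mut c_void {
    let mut mapped: *mut c_void = core::ptr::null_mut();
    // 0x1 is hipIpcMemLazyEnablePeerAccess, the only accepted flag.
    let _ = hipIpcOpenMemHandle(&mut mapped, handle, 0x1);
    // ... use `mapped`, then release it with hipIpcCloseMemHandle(mapped) ...
    mapped
}
```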
For contexts on different devices ::hipIpcOpenMemHandle can attempt to enable + peer access between the devices as if the user had called ::hipDeviceEnablePeerAccess. + This behavior is controlled by the flag #hipIpcMemLazyEnablePeerAccess. + The API ::hipDeviceCanAccessPeer can determine if a mapping is possible. - Contexts that may open hipIpcMemHandles are restricted in the following way. - hipIpcMemHandles from each device in a given process may only be opened - by one context per device per other process. + hipIpcMemHandles from each device in a given process may only be opened by one context per + device per other process. - Memory returned from hipIpcOpenMemHandle must be freed with - hipIpcCloseMemHandle. + Memory returned from ::hipIpcOpenMemHandle must be freed with ::hipIpcCloseMemHandle. - Calling hipFree on an exported memory region before calling - hipIpcCloseMemHandle in the importing context will result in undefined - behavior. + Calling ::hipFree on an exported memory region before calling ::hipIpcCloseMemHandle in the + importing context will result in undefined behavior. @param devPtr - Returned device pointer @param handle - hipIpcMemHandle to open @param flags - Flags for this operation. Must be specified as hipIpcMemLazyEnablePeerAccess @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidContext, - #hipErrorInvalidDevicePointer + #hipErrorInvalidDevicePointer @note During multiple processes, using the same memory handle opened by the current context, - there is no guarantee that the same device poiter will be returned in @p *devPtr. This is diffrent from CUDA. + there is no guarantee that the same device pointer will be returned in @p *devPtr. This is different from CUDA. + @note This IPC memory related feature API on Windows may behave differently from Linux. */ pub fn hipIpcOpenMemHandle( devPtr: *mut *mut ::core::ffi::c_void, handle: hipIpcMemHandle_t, flags: ::core::ffi::c_uint, ) -> hipError_t; } extern "C" { #[must_use] - /** @brief Close memory mapped with hipIpcOpenMemHandle + /** @brief Close memory mapped with ::hipIpcOpenMemHandle - Unmaps memory returnd by hipIpcOpenMemHandle. The original allocation - in the exporting process as well as imported mappings in other processes - will be unaffected. + Unmaps memory returned by ::hipIpcOpenMemHandle. The original allocation in the exporting + process as well as imported mappings in other processes will be unaffected. - Any resources used to enable peer access will be freed if this is the - last mapping using them. + Any resources used to enable peer access will be freed if this is the last mapping using them. - @param devPtr - Device pointer returned by hipIpcOpenMemHandle + @param devPtr - Device pointer returned by ::hipIpcOpenMemHandle @returns #hipSuccess, #hipErrorMapFailed, #hipErrorInvalidHandle @@ -4889,13 +5456,14 @@ #[must_use] /** @brief Gets an opaque interprocess handle for an event. - This opaque handle may be copied into other processes and opened with hipIpcOpenEventHandle. - Then hipEventRecord, hipEventSynchronize, hipStreamWaitEvent and hipEventQuery may be used in - either process. Operations on the imported event after the exported event has been freed with hipEventDestroy - will result in undefined behavior. + The event is previously allocated with #hipEventInterprocess and #hipEventDisableTiming flags. + The opaque interprocess handle may be copied into other processes and opened with + ::hipIpcOpenEventHandle. Then ::hipEventRecord, ::hipEventSynchronize, ::hipStreamWaitEvent and + ::hipEventQuery may be used in either process.
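The event side of the IPC story mirrors the memory side. A hedged sketch, not part of this patch: the flag values (hipEventDisableTiming = 0x2, hipEventInterprocess = 0x4) are assumed from the HIP headers, and status checks are elided.

```rust
use std::mem::MaybeUninit;

// Exporting process: the event must carry both interprocess flags.
unsafe fn export_event() -> (hipEvent_t, hipIpcEventHandle_t) {
    let mut event = MaybeUninit::<hipEvent_t>::uninit();
    let _ = hipEventCreateWithFlags(event.as_mut_ptr(), 0x2 | 0x4);
    let event = event.assume_init();
    let mut handle = MaybeUninit::<hipIpcEventHandle_t>::zeroed();
    let _ = hipIpcGetEventHandle(handle.as_mut_ptr(), event);
    (event, handle.assume_init())
}

// Importing process: the opened event behaves like a local event created
// with hipEventDisableTiming and must eventually be freed with
// hipEventDestroy.
unsafe fn import_event(handle: hipIpcEventHandle_t) -> hipEvent_t {
    let mut event = MaybeUninit::<hipEvent_t>::uninit();
    let _ = hipIpcOpenEventHandle(event.as_mut_ptr(), handle);
    event.assume_init()
}
```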
After the exported event has been freed with + ::hipEventDestroy, operations on the imported event will result in undefined behavior. - @param[out] handle Pointer to hipIpcEventHandle to return the opaque event handle - @param[in] event Event allocated with hipEventInterprocess and hipEventDisableTiming flags + @param[out] handle Pointer to #hipIpcEventHandle to return the opaque event handle + @param[in] event Event allocated with #hipEventInterprocess and #hipEventDisableTiming flags @returns #hipSuccess, #hipErrorInvalidConfiguration, #hipErrorInvalidValue @@ -4908,15 +5476,16 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Opens an interprocess event handles. + /** @brief Opens an interprocess event handle. - Opens an interprocess event handle exported from another process with hipIpcGetEventHandle. The returned - hipEvent_t behaves like a locally created event with the hipEventDisableTiming flag specified. This event - need be freed with hipEventDestroy. Operations on the imported event after the exported event has been freed - with hipEventDestroy will result in undefined behavior. If the function is called within the same process where - handle is returned by hipIpcGetEventHandle, it will return hipErrorInvalidContext. + Opens an interprocess event handle exported from another process with ::hipIpcGetEventHandle. + The returned #hipEvent_t behaves like a locally created event with the #hipEventDisableTiming + flag specified. This event needs to be freed with ::hipEventDestroy. After the exported event + has been freed with ::hipEventDestroy, operations on the imported event will result in + undefined behavior. If the input handle is from the same process, it will return + #hipErrorInvalidContext. - @param[out] event Pointer to hipEvent_t to return the event + @param[out] event Pointer to hipEvent_t to return the imported event @param[in] handle The opaque interprocess handle to open @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidContext @@ -4992,7 +5561,7 @@ extern "C" { #[must_use] /** @brief Return last error returned by any HIP runtime API call. - @return #hipSuccess + @returns #hipSuccess Returns the last error that has been returned by any of the runtime calls in the same host thread. Unlike hipGetLastError, this function does not reset the saved error code. @@ -5004,7 +5573,7 @@ extern "C" { /** @brief Return hip error as text string form. @param hip_error Error code to convert to name. - @return const char pointer to the NULL-terminated error name + @returns const char pointer to the NULL-terminated error name @see hipGetErrorString, hipGetLastError, hipPeekAtLastError, hipError_t*/ pub fn hipGetErrorName(hip_error: hipError_t) -> *const ::core::ffi::c_char; @@ -5013,7 +5582,7 @@ extern "C" { /** @brief Return handy text string message to explain the error which occurred @param hipError Error code to convert to string. - @return const char pointer to the NULL-terminated error string + @returns const char pointer to the NULL-terminated error string @see hipGetErrorName, hipGetLastError, hipPeekAtLastError, hipError_t*/ pub fn hipGetErrorString(hipError: hipError_t) -> *const ::core::ffi::c_char; @@ -5024,7 +5593,7 @@ extern "C" { @param [in] hipError Error code to convert to string.
@param [out] errorString char pointer to the NULL-terminated error string - @return #hipSuccess, #hipErrorInvalidValue + @returns #hipSuccess, #hipErrorInvalidValue @see hipGetErrorName, hipGetLastError, hipPeekAtLastError, hipError_t*/ pub fn hipDrvGetErrorName( @@ -5038,7 +5607,7 @@ extern "C" { @param [in] hipError Error code to convert to string. @param [out] errorString char pointer to the NULL-terminated error string - @return #hipSuccess, #hipErrorInvalidValue + @returns #hipSuccess, #hipErrorInvalidValue @see hipGetErrorName, hipGetLastError, hipPeekAtLastError, hipError_t*/ pub fn hipDrvGetErrorString( @@ -5052,14 +5621,14 @@ extern "C" { @param[in, out] stream Valid pointer to hipStream_t. This function writes the memory with the newly created stream. - @return #hipSuccess, #hipErrorInvalidValue + @returns #hipSuccess, #hipErrorInvalidValue Create a new asynchronous stream. @p stream returns an opaque handle that can be used to reference the newly created stream in subsequent hipStream* commands. The stream is allocated on the heap and will remain allocated even if the handle goes out-of-scope. To release the memory used by the stream, application must call hipStreamDestroy. - @return #hipSuccess, #hipErrorInvalidValue + @returns #hipSuccess, #hipErrorInvalidValue @see hipStreamCreateWithFlags, hipStreamCreateWithPriority, hipStreamSynchronize, hipStreamWaitEvent, hipStreamDestroy*/ pub fn hipStreamCreate(stream: *mut hipStream_t) -> hipError_t; @@ -5070,7 +5639,7 @@ extern "C" { @param[in, out] stream Pointer to new stream @param[in ] flags to control stream creation. - @return #hipSuccess, #hipErrorInvalidValue + @returns #hipSuccess, #hipErrorInvalidValue Create a new asynchronous stream. @p stream returns an opaque handle that can be used to reference the newly created stream in subsequent hipStream* commands. The stream is allocated on @@ -5092,7 +5661,7 @@ extern "C" { @param[in, out] stream Pointer to new stream @param[in ] flags to control stream creation. @param[in ] priority of the stream. Lower numbers represent higher priorities. - @return #hipSuccess, #hipErrorInvalidValue + @returns #hipSuccess, #hipErrorInvalidValue Create a new asynchronous stream with the specified priority. @p stream returns an opaque handle that can be used to reference the newly created stream in subsequent hipStream* commands. The @@ -5132,7 +5701,7 @@ extern "C" { /** @brief Destroys the specified stream. @param[in] stream stream identifier. - @return #hipSuccess #hipErrorInvalidHandle + @returns #hipSuccess #hipErrorInvalidHandle Destroys the specified stream. @@ -5153,7 +5722,7 @@ extern "C" { @param[in] stream stream to query - @return #hipSuccess, #hipErrorNotReady, #hipErrorInvalidHandle + @returns #hipSuccess, #hipErrorNotReady, #hipErrorInvalidHandle This is thread-safe and returns a snapshot of the current state of the queue. However, if other host threads are sending work to the stream, the status may change immediately after the function @@ -5169,7 +5738,7 @@ extern "C" { @param[in] stream stream identifier. - @return #hipSuccess, #hipErrorInvalidHandle + @returns #hipSuccess, #hipErrorInvalidHandle This command is host-synchronous : the host will block until the specified stream is empty. @@ -5193,7 +5762,7 @@ extern "C" { @param[in] event event to wait on @param[in] flags control operation [must be 0] - @return #hipSuccess, #hipErrorInvalidHandle + @returns #hipSuccess, #hipErrorInvalidHandle This function inserts a wait operation into the specified stream.
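A small illustrative sketch of the wait-event pattern being described here, assuming the hipEventCreate, hipEventRecord, hipStreamWaitEvent, and hipEventDestroy declarations from this file; not part of the generated bindings.

// Illustrative: make `consumer` wait for work already submitted to `producer`.
unsafe fn fence_between(producer: hipStream_t, consumer: hipStream_t) {
    let mut event: hipEvent_t = std::ptr::null_mut();
    assert_eq!(hipEventCreate(&mut event), hipError_t::hipSuccess);
    assert_eq!(hipEventRecord(event, producer), hipError_t::hipSuccess);
    // flags must be 0, per the description above.
    assert_eq!(hipStreamWaitEvent(consumer, event, 0), hipError_t::hipSuccess);
    // The event can be destroyed once the wait has been enqueued.
    assert_eq!(hipEventDestroy(event), hipError_t::hipSuccess);
}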
All future work submitted to @p stream will wait until @p event reports completion before @@ -5216,7 +5785,7 @@ extern "C" { @param[in] stream stream to be queried @param[in,out] flags Pointer to an unsigned integer in which the stream's flags are returned - @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidHandle + @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidHandle @returns #hipSuccess #hipErrorInvalidValue #hipErrorInvalidHandle @@ -5234,7 +5803,7 @@ extern "C" { @param[in] stream stream to be queried @param[in,out] priority Pointer to an unsigned integer in which the stream's priority is returned - @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidHandle + @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidHandle @returns #hipSuccess #hipErrorInvalidValue #hipErrorInvalidHandle @@ -5252,7 +5821,7 @@ extern "C" { @param[in] stream stream to be queried @param[out] device device associated with the stream - @return #hipSuccess, #hipErrorInvalidValue, #hipErrorContextIsDestroyed, #hipErrorInvalidHandle, + @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorContextIsDestroyed, #hipErrorInvalidHandle, #hipErrorNotInitialized, #hipErrorDeinitialized, #hipErrorInvalidContext @see hipStreamCreate, hipStreamDestroy, hipDeviceGetStreamPriorityRange*/ @@ -5271,7 +5840,7 @@ extern "C" { The first 32 bits represent the first 32 CUs, and so on. If its size is greater than physical CU number (i.e., multiProcessorCount member of hipDeviceProp_t), the extra elements are ignored. It is user's responsibility to make sure the input is meaningful. - @return #hipSuccess, #hipErrorInvalidHandle, #hipErrorInvalidValue + @returns #hipSuccess, #hipErrorInvalidHandle, #hipErrorInvalidValue Create a new asynchronous stream with the specified CU mask. @p stream returns an opaque handle that can be used to reference the newly created stream in subsequent hipStream* commands. The @@ -5295,7 +5864,7 @@ extern "C" { @param[out] cuMask Pointer to a pre-allocated block of memories (uint32_t *) in which the stream's CU mask is returned. 
The CU mask is returned in a chunk of 32 bits where each active bit represents one active CU - @return #hipSuccess, #hipErrorInvalidHandle, #hipErrorInvalidValue + @returns #hipSuccess, #hipErrorInvalidHandle, #hipErrorInvalidValue @see hipStreamCreate, hipStreamSynchronize, hipStreamWaitEvent, hipStreamDestroy*/ pub fn hipExtStreamGetCUMask( @@ -5322,7 +5891,7 @@ extern "C" { @param[in] callback - The function to call once preceding stream operations are complete @param[in] userData - User specified data to be passed to the callback function @param[in] flags - Reserved for future use, must be 0 - @return #hipSuccess, #hipErrorInvalidHandle, #hipErrorNotSupported + @returns #hipSuccess, #hipErrorInvalidHandle, #hipErrorNotSupported @see hipStreamCreate, hipStreamCreateWithFlags, hipStreamQuery, hipStreamSynchronize, hipStreamWaitEvent, hipStreamDestroy, hipStreamCreateWithPriority @@ -5336,7 +5905,39 @@ extern "C" { } extern "C" { #[must_use] - #[doc = " @}\n/\n/**\n-------------------------------------------------------------------------------------------------\n-------------------------------------------------------------------------------------------------\n @defgroup StreamM Stream Memory Operations\n @{\n This section describes Stream Memory Wait and Write functions of HIP runtime API.\n/\n/**\n @brief Enqueues a wait command to the stream.[BETA]\n\n @param [in] stream - Stream identifier\n @param [in] ptr - Pointer to memory object allocated using 'hipMallocSignalMemory' flag\n @param [in] value - Value to be used in compare operation\n @param [in] flags - Defines the compare operation, supported values are hipStreamWaitValueGte\n hipStreamWaitValueEq, hipStreamWaitValueAnd and hipStreamWaitValueNor\n @param [in] mask - Mask to be applied on value at memory before it is compared with value,\n default value is set to enable every bit\n\n @returns #hipSuccess, #hipErrorInvalidValue\n\n Enqueues a wait command to the stream, all operations enqueued on this stream after this, will\n not execute until the defined wait condition is true.\n\n hipStreamWaitValueGte: waits until *ptr&mask >= value\n hipStreamWaitValueEq : waits until *ptr&mask == value\n hipStreamWaitValueAnd: waits until ((*ptr&mask) & value) != 0\n hipStreamWaitValueNor: waits until ~((*ptr&mask) | (value&mask)) != 0\n\n @note when using 'hipStreamWaitValueNor', mask is applied on both 'value' and '*ptr'.\n\n @note Support for hipStreamWaitValue32 can be queried using 'hipDeviceGetAttribute()' and\n 'hipDeviceAttributeCanUseStreamWaitValue' flag.\n\n @warning This API is marked as beta, meaning, while this is feature complete,\n it is still open to changes and may have outstanding issues.\n\n @see hipExtMallocWithFlags, hipFree, hipStreamWaitValue64, hipStreamWriteValue64,\n hipStreamWriteValue32, hipDeviceGetAttribute"] + /** @brief Enqueues a wait command to the stream.[BETA] + + @param [in] stream - Stream identifier + @param [in] ptr - Pointer to memory object allocated using #hipMallocSignalMemory flag + @param [in] value - Value to be used in compare operation + @param [in] flags - Defines the compare operation, supported values are #hipStreamWaitValueGte + #hipStreamWaitValueEq, #hipStreamWaitValueAnd and #hipStreamWaitValueNor + @param [in] mask - Mask to be applied on value at memory before it is compared with value, + default value is set to enable every bit + + @returns #hipSuccess, #hipErrorInvalidValue + + Enqueues a wait command to the stream, all operations enqueued on this stream after this, will + not
execute until the defined wait condition is true. + + #hipStreamWaitValueGte: waits until *ptr&mask >= value + + #hipStreamWaitValueEq : waits until *ptr&mask == value + + #hipStreamWaitValueAnd: waits until ((*ptr&mask) & value) != 0 + + #hipStreamWaitValueNor: waits until ~((*ptr&mask) | (value&mask)) != 0 + + @note when using #hipStreamWaitValueNor, mask is applied on both 'value' and '*ptr'. + + @note Support for #hipStreamWaitValue32 can be queried using 'hipDeviceGetAttribute()' and + 'hipDeviceAttributeCanUseStreamWaitValue' flag. + + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues. + + @see hipExtMallocWithFlags, hipFree, hipStreamWaitValue64, hipStreamWriteValue64, + hipStreamWriteValue32, hipDeviceGetAttribute*/ pub fn hipStreamWaitValue32( stream: hipStream_t, ptr: *mut ::core::ffi::c_void, @@ -5352,8 +5953,8 @@ extern "C" { @param [in] stream - Stream identifier @param [in] ptr - Pointer to memory object allocated using 'hipMallocSignalMemory' flag @param [in] value - Value to be used in compare operation - @param [in] flags - Defines the compare operation, supported values are hipStreamWaitValueGte - hipStreamWaitValueEq, hipStreamWaitValueAnd and hipStreamWaitValueNor. + @param [in] flags - Defines the compare operation, supported values are #hipStreamWaitValueGte + #hipStreamWaitValueEq, #hipStreamWaitValueAnd and #hipStreamWaitValueNor. @param [in] mask - Mask to be applied on value at memory before it is compared with value default value is set to enable every bit @@ -5362,18 +5963,21 @@ extern "C" { Enqueues a wait command to the stream, all operations enqueued on this stream after this, will not execute until the defined wait condition is true. - hipStreamWaitValueGte: waits until *ptr&mask >= value - hipStreamWaitValueEq : waits until *ptr&mask == value - hipStreamWaitValueAnd: waits until ((*ptr&mask) & value) != 0 - hipStreamWaitValueNor: waits until ~((*ptr&mask) | (value&mask)) != 0 + #hipStreamWaitValueGte: waits until *ptr&mask >= value - @note when using 'hipStreamWaitValueNor', mask is applied on both 'value' and '*ptr'. + #hipStreamWaitValueEq : waits until *ptr&mask == value + + #hipStreamWaitValueAnd: waits until ((*ptr&mask) & value) != 0 + + #hipStreamWaitValueNor: waits until ~((*ptr&mask) | (value&mask)) != 0 + + @note when using #hipStreamWaitValueNor, mask is applied on both 'value' and '*ptr'. @note Support for hipStreamWaitValue64 can be queried using 'hipDeviceGetAttribute()' and 'hipDeviceAttributeCanUseStreamWaitValue' flag. - @warning This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues. @see hipExtMallocWithFlags, hipFree, hipStreamWaitValue32, hipStreamWriteValue64, hipStreamWriteValue32, hipDeviceGetAttribute*/ @@ -5399,8 +6003,8 @@ extern "C" { Enqueues a write command to the stream, write operation is performed after all earlier commands on this stream have completed the execution. - @warning This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues. 
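The four compare modes read most easily as executable logic. Below is a pure host-side restatement, using an illustrative enum of our own rather than the HIP flag constants; the real comparison is performed by the runtime against the signal memory, so this is a sketch of the semantics only.

// Host-side restatement of the four wait predicates (illustrative only).
enum WaitOp {
    Gte,
    Eq,
    And,
    Nor,
}

fn wait_condition_met(mem: u32, value: u32, mask: u32, op: WaitOp) -> bool {
    match op {
        WaitOp::Gte => (mem & mask) >= value,
        WaitOp::Eq => (mem & mask) == value,
        WaitOp::And => ((mem & mask) & value) != 0,
        // Nor applies the mask to both operands, as the note above states.
        WaitOp::Nor => !((mem & mask) | (value & mask)) != 0,
    }
}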
@see hipExtMallocWithFlags, hipFree, hipStreamWriteValue32, hipStreamWaitValue32, hipStreamWaitValue64*/ @@ -5425,8 +6029,8 @@ extern "C" { Enqueues a write command to the stream, write operation is performed after all earlier commands on this stream have completed the execution. - @warning This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues. @see hipExtMallocWithFlags, hipFree, hipStreamWriteValue32, hipStreamWaitValue32, hipStreamWaitValue64*/ @@ -5437,6 +6041,126 @@ extern "C" { flags: ::core::ffi::c_uint, ) -> hipError_t; } +extern "C" { + #[must_use] + /** @brief Enqueues an array of stream memory operations in the stream.[BETA] + + @param [in] stream - Stream identifier + @param [in] count - The number of operations in the array. Must be less than 256 + @param [in] paramArray - The types and parameters of the individual operations. + @param [in] flags - Reserved for future expansion; must be 0. + + @returns #hipSuccess, #hipErrorInvalidValue + + Batch operations to synchronize the stream via memory operations. + + @warning This API is marked as beta, meaning, while this is feature complete, + it is still open to changes and may have outstanding issues. + + @see hipStreamWriteValue32, hipStreamWaitValue32, + hipStreamWaitValue64, hipStreamWriteValue64*/ + pub fn hipStreamBatchMemOp( + stream: hipStream_t, + count: ::core::ffi::c_uint, + paramArray: *mut hipStreamBatchMemOpParams, + flags: ::core::ffi::c_uint, + ) -> hipError_t; +} +extern "C" { + #[must_use] + /** @brief Creates a batch memory operation node and adds it to a graph.[BETA] + + @param [in] phGraphNode - Returns the newly created node + @param [in] hGraph - Graph to which to add the node + @param [in] dependencies - Dependencies of the node + @param [in] numDependencies - Number of dependencies + @param [in] nodeParams - Parameters for the node + + @returns #hipSuccess, #hipErrorInvalidValue + + @warning This API is marked as beta, meaning, while this is feature complete, + it is still open to changes and may have outstanding issues. + + @see hipStreamWriteValue32, hipStreamWaitValue32, + hipStreamWaitValue64, hipStreamWriteValue64, hipStreamBatchMemOp*/ + pub fn hipGraphAddBatchMemOpNode( + phGraphNode: *mut hipGraphNode_t, + hGraph: hipGraph_t, + dependencies: *const hipGraphNode_t, + numDependencies: usize, + nodeParams: *const hipBatchMemOpNodeParams, + ) -> hipError_t; +} +extern "C" { + #[must_use] + /** @brief Returns a batch mem op node's parameters.[BETA] + + @param [in] hNode - Node to get the parameters for + @param [in] nodeParams_out - Pointer to return the parameters + + @returns #hipSuccess, #hipErrorInvalidValue + + Returns the parameters of batch mem op node hNode in nodeParams_out. + The paramArray returned in nodeParams_out is owned by the node. + This memory remains valid until the node is destroyed or its parameters are modified, + and should not be modified directly. + + @warning This API is marked as beta, meaning, while this is feature complete, + it is still open to changes and may have outstanding issues. + + @see hipStreamWriteValue32, hipStreamWaitValue32, + hipStreamWaitValue64, hipStreamWriteValue64,
hipGraphBatchMemOpNodeSetParams*/ + pub fn hipGraphBatchMemOpNodeGetParams( + hNode: hipGraphNode_t, + nodeParams_out: *mut hipBatchMemOpNodeParams, + ) -> hipError_t; +} +extern "C" { + #[must_use] + /** @brief Sets the batch mem op node's parameters.[BETA] + + @param [in] hNode - Node to set the parameters for + @param [in] nodeParams - Parameters to copy + + @returns #hipSuccess, #hipErrorInvalidValue + + Sets the parameters of batch mem op node hNode to nodeParams. + + @warning This API is marked as beta, meaning, while this is feature complete, + it is still open to changes and may have outstanding issues. + + @see hipStreamWriteValue32, hipStreamWaitValue32, + hipStreamWaitValue64, hipStreamWriteValue64, hipGraphBatchMemOpNodeGetParams*/ + pub fn hipGraphBatchMemOpNodeSetParams( + hNode: hipGraphNode_t, + nodeParams: *mut hipBatchMemOpNodeParams, + ) -> hipError_t; +} +extern "C" { + #[must_use] + /** @brief Sets the parameters for a batch mem op node in the given graphExec.[BETA] + + @param [in] hGraphExec - The executable graph in which to set the specified node + @param [in] hNode - Batch mem op node from the graph from which graphExec was instantiated + @param [in] nodeParams - Updated parameters to set + + @returns #hipSuccess, #hipErrorInvalidValue + + Sets the parameters of a batch mem op node in an executable graph hGraphExec. + The node is identified by the corresponding node hNode in the non-executable graph, + from which the executable graph was instantiated. + + @warning This API is marked as beta, meaning, while this is feature complete, + it is still open to changes and may have outstanding issues. + + @see hipStreamWriteValue32, hipStreamWaitValue32, + hipStreamWaitValue64, hipStreamWriteValue64, hipStreamBatchMemOp*/ + pub fn hipGraphExecBatchMemOpNodeSetParams( + hGraphExec: hipGraphExec_t, + hNode: hipGraphNode_t, + nodeParams: *const hipBatchMemOpNodeParams, + ) -> hipError_t; +} extern "C" { #[must_use] #[doc = " @}\n/\n/**\n-------------------------------------------------------------------------------------------------\n-------------------------------------------------------------------------------------------------\n @defgroup Event Event Management\n @{\n This section describes the event management functions of HIP runtime API.\n/\n/**\n @brief Create an event with the specified flags\n\n @param[in,out] event Returns the newly created event.\n @param[in] flags Flags to control event behavior. Valid values are #hipEventDefault,\n#hipEventBlockingSync, #hipEventDisableTiming, #hipEventInterprocess\n #hipEventDefault : Default flag. The event will use active synchronization and will support\ntiming. Blocking synchronization provides lowest possible latency at the expense of dedicating a\nCPU to poll on the event.\n #hipEventBlockingSync : The event will use blocking synchronization : if hipEventSynchronize is\ncalled on this event, the thread will block until the event completes. This can increase latency\nfor the synchronization but can result in lower power and more resources for other CPU threads.\n #hipEventDisableTiming : Disable recording of timing information. Events created with this flag\nwould not record profiling data and provide best performance if used for synchronization.\n #hipEventInterprocess : The event can be used as an interprocess event. hipEventDisableTiming\nflag also must be set when hipEventInterprocess flag is set.\n #hipEventDisableSystemFence : Disable acquire and release system scope fence.
This may\nimprove performance but device memory may not be visible to the host and other devices\nif this flag is set.\n\n @returns #hipSuccess, #hipErrorNotInitialized, #hipErrorInvalidValue,\n#hipErrorLaunchFailure, #hipErrorOutOfMemory\n\n @see hipEventCreate, hipEventSynchronize, hipEventDestroy, hipEventElapsedTime"] @@ -5458,6 +6182,48 @@ extern "C" { hipEventDestroy, hipEventElapsedTime*/ pub fn hipEventCreate(event: *mut hipEvent_t) -> hipError_t; } +extern "C" { + #[must_use] + /** @brief Record an event in the specified stream. + + @param[in] event event to record. + @param[in] stream stream in which to record event. + @param[in] flags flags controlling the record operation + @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized, + #hipErrorInvalidHandle, #hipErrorLaunchFailure + + hipEventQuery() or hipEventSynchronize() must be used to determine when the event + transitions from "recording" (after hipEventRecord() is called) to "recorded" + (when timestamps are set, if requested). + + Events which are recorded in a non-NULL stream will transition + from the "recording" to the "recorded" state when they reach the head of + the specified stream, after all previous + commands in that stream have completed executing. + + Flags include: + hipEventRecordDefault: Default event record flag. + hipEventRecordExternal: Event is captured in the graph as an external event node when + performing stream capture + + If hipEventRecord() has been previously called on this event, then this call will overwrite any + existing state in event. + + If this function is called on an event that is currently being recorded, the results are + undefined: the outstanding recording may save state into the event, and the ordering is not + guaranteed. + + @note If this function is not called before hipEventQuery() or hipEventSynchronize() is used, + #hipSuccess is returned, meaning there is no pending event in the stream. + + @see hipEventCreate, hipEventCreateWithFlags, hipEventQuery, hipEventSynchronize, + hipEventDestroy, hipEventElapsedTime +*/ + pub fn hipEventRecordWithFlags( + event: hipEvent_t, + stream: hipStream_t, + flags: ::core::ffi::c_uint, + ) -> hipError_t; +} extern "C" { #[must_use] pub fn hipEventRecord(event: hipEvent_t, stream: hipStream_t) -> hipError_t; } @@ -5546,7 +6312,7 @@ extern "C" { commands in the appropriate stream (specified to hipEventRecord()) have completed. If any execution has not completed, then #hipErrorNotReady is returned. - @note: This API returns #hipSuccess, if hipEventRecord() is not called before this API. + @note This API returns #hipSuccess, if hipEventRecord() is not called before this API. @see hipEventCreate, hipEventCreateWithFlags, hipEventRecord, hipEventDestroy, hipEventSynchronize, hipEventElapsedTime*/ @@ -5560,10 +6326,10 @@ extern "C" { @param [in] attribute Attribute to set @param [in] ptr Pointer to set attributes for - @return #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue + @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue - @warning This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues. */ pub fn hipPointerSetAttribute( value: *const ::core::ffi::c_void, @@ -5585,7 +6351,7 @@ extern "C" { @note The unrecognized memory type is unsupported to keep the HIP functionality backward compatibility due to #hipMemoryType enum values.
- @return #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue + @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue @note The current behavior of this HIP API corresponds to the CUDA API before version 11.0. @@ -5603,10 +6369,10 @@ extern "C" { @param [in] attribute Attribute to query for @param [in] ptr Pointer to get attributes for - @return #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue + @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue - @warning This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues. @see hipPointerGetAttributes*/ pub fn hipPointerGetAttribute( @@ -5625,10 +6391,10 @@ extern "C" { where the result of each attribute query will be written to @param [in] ptr pointer to get attributes for - @return #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue + @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue - @warning This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues. @see hipPointerGetAttribute*/ pub fn hipDrvPointerGetAttributes( @@ -5640,7 +6406,7 @@ extern "C" { } extern "C" { #[must_use] - #[doc = "-------------------------------------------------------------------------------------------------\n-------------------------------------------------------------------------------------------------\n @defgroup External External Resource Interoperability\n @{\n @ingroup API\n\n This section describes the external resource interoperability functions of HIP runtime API.\n\n/\n/**\n @brief Imports an external semaphore.\n\n @param[out] extSem_out External semaphores to be waited on\n @param[in] semHandleDesc Semaphore import handle descriptor\n\n @return #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue\n\n @see"] + #[doc = "-------------------------------------------------------------------------------------------------\n-------------------------------------------------------------------------------------------------\n @defgroup External External Resource Interoperability\n @{\n @ingroup API\n\n This section describes the external resource interoperability functions of HIP runtime API.\n\n/\n/**\n @brief Imports an external semaphore.\n\n @param[out] extSem_out External semaphores to be waited on\n @param[in] semHandleDesc Semaphore import handle descriptor\n\n @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue\n\n @see\n\n @note This API is currently not supported on Linux.\n"] pub fn hipImportExternalSemaphore( extSem_out: *mut hipExternalSemaphore_t, semHandleDesc: *const hipExternalSemaphoreHandleDesc, @@ -5655,9 +6421,12 @@ extern "C" { @param[in] numExtSems Number of semaphores to wait on @param[in] stream Stream to enqueue the wait operations in - @return #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue + @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue - @see*/ + @see + + @note This API is currently not supported on Linux. 
+*/ pub fn hipSignalExternalSemaphoresAsync( extSemArray: *const hipExternalSemaphore_t, paramsArray: *const hipExternalSemaphoreSignalParams, @@ -5674,9 +6443,12 @@ extern "C" { @param[in] numExtSems Number of semaphores to wait on @param[in] stream Stream to enqueue the wait operations in - @return #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue + @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue - @see*/ + @see + + @note This API is currently not supported on Linux. +*/ pub fn hipWaitExternalSemaphoresAsync( extSemArray: *const hipExternalSemaphore_t, paramsArray: *const hipExternalSemaphoreWaitParams, @@ -5690,9 +6462,12 @@ extern "C" { @param[in] extSem handle to an external memory object - @return #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue + @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue - @see*/ + @see + + @note This API is currently not supported on Linux. +*/ pub fn hipDestroyExternalSemaphore(extSem: hipExternalSemaphore_t) -> hipError_t; } extern "C" { @@ -5702,7 +6477,7 @@ extern "C" { @param[out] extMem_out Returned handle to an external memory object @param[in] memHandleDesc Memory import handle descriptor - @return #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue + @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue @see*/ pub fn hipImportExternalMemory( @@ -5718,7 +6493,7 @@ extern "C" { @param[in] extMem Handle to external memory object @param[in] bufferDesc Buffer descriptor - @return #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue + @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue @see*/ pub fn hipExternalMemoryGetMappedBuffer( @@ -5748,7 +6523,7 @@ extern "C" { Returned mipmapped array must be freed using hipFreeMipmappedArray. - @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidResourceHandle + @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidResourceHandle @see hipImportExternalMemory, hipDestroyExternalMemory, hipExternalMemoryGetMappedBuffer, hipFreeMipmappedArray*/ pub fn hipExternalMemoryGetMappedMipmappedArray( @@ -5759,7 +6534,7 @@ extern "C" { } extern "C" { #[must_use] - #[doc = " @}\n/\n/**\n @brief Allocate memory on the default accelerator\n\n @param[out] ptr Pointer to the allocated memory\n @param[in] size Requested memory size\n\n If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned.\n\n @return #hipSuccess, #hipErrorOutOfMemory, #hipErrorInvalidValue (bad context, null *ptr)\n\n @see hipMallocPitch, hipFree, hipMallocArray, hipFreeArray, hipMalloc3D, hipMalloc3DArray,\n hipHostFree, hipHostMalloc"] + #[doc = " @}\n/\n/**\n @brief Allocate memory on the default accelerator\n\n @param[out] ptr Pointer to the allocated memory\n @param[in] size Requested memory size\n\n If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned.\n\n @returns #hipSuccess, #hipErrorOutOfMemory, #hipErrorInvalidValue (bad context, null *ptr)\n\n @see hipMallocPitch, hipFree, hipMallocArray, hipFreeArray, hipMalloc3D, hipMalloc3DArray,\n hipHostFree, hipHostMalloc"] pub fn hipMalloc(ptr: *mut *mut ::core::ffi::c_void, size: usize) -> hipError_t; } extern "C" { @@ -5777,10 +6552,10 @@ extern "C" { #hipDeviceMallocFinegrained, #hipDeviceMallocUncached, or #hipMallocSignalMemory. If the flag is any other value, the API returns #hipErrorInvalidValue. 
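A minimal sketch of the flag-checked allocation just described, assuming the hipDeviceMallocFinegrained constant named above is exposed by these bindings; illustrative only, with error handling reduced to an assert.

use core::ffi::c_void;

// Illustrative: request a fine-grained allocation. Any flag outside the
// documented set makes the call fail with hipErrorInvalidValue.
unsafe fn alloc_finegrained(bytes: usize) -> *mut c_void {
    let mut ptr: *mut c_void = std::ptr::null_mut();
    assert_eq!(
        hipExtMallocWithFlags(&mut ptr, bytes, hipDeviceMallocFinegrained),
        hipError_t::hipSuccess
    );
    ptr
}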
- @return #hipSuccess, #hipErrorOutOfMemory, #hipErrorInvalidValue (bad context, null *ptr) + @returns #hipSuccess, #hipErrorOutOfMemory, #hipErrorInvalidValue (bad context, null *ptr) @see hipMallocPitch, hipFree, hipMallocArray, hipFreeArray, hipMalloc3D, hipMalloc3DArray, - hipHostFree, hipHostMalloc*/ + hipHostFree, hipHostMalloc*/ pub fn hipExtMallocWithFlags( ptr: *mut *mut ::core::ffi::c_void, sizeBytes: usize, @@ -5796,7 +6571,7 @@ extern "C" { If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned. - @return #hipSuccess, #hipErrorOutOfMemory + @returns #hipSuccess, #hipErrorOutOfMemory @warning This API is deprecated, use hipHostMalloc() instead*/ pub fn hipMallocHost(ptr: *mut *mut ::core::ffi::c_void, size: usize) -> hipError_t; @@ -5810,7 +6585,7 @@ extern "C" { If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned. - @return #hipSuccess, #hipErrorOutOfMemory + @returns #hipSuccess, #hipErrorOutOfMemory @warning This API is deprecated, use hipHostMalloc() instead*/ pub fn hipMemAllocHost( @@ -5842,13 +6617,15 @@ extern "C" { @param[out] ptr Pointer to the allocated host pinned memory @param[in] size Requested memory size in bytes If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned. - @param[in] flags Type of host memory allocation + @param[in] flags Type of host memory allocation. See the description of flags in + hipSetDeviceFlags. If no flags are passed, the default is a pinned memory allocation on the host. - @return #hipSuccess, #hipErrorOutOfMemory + @returns #hipSuccess, #hipErrorOutOfMemory - @see hipSetDeviceFlags, hipHostFree*/ + + @see hipSetDeviceFlags, hipHostFree*/ pub fn hipHostMalloc( ptr: *mut *mut ::core::ffi::c_void, size: usize, @@ -5875,7 +6652,7 @@ extern "C" { @returns #hipSuccess, #hipErrorInvalidValue - @note This API is implemented on Linux, under development on Windows.*/ + @note This API is implemented on Linux and is under development on Microsoft Windows.*/ pub fn hipMemPrefetchAsync( dev_ptr: *const ::core::ffi::c_void, count: usize, @@ -5901,7 +6678,7 @@ extern "C" { be aligned to CPU page size, the same way as corresponding CUDA API behaves in CUDA version 8.0 and afterwards. - @note This API is implemented on Linux and is under development on Windows.*/ + @note This API is implemented on Linux and is under development on Microsoft Windows.*/ pub fn hipMemAdvise( dev_ptr: *const ::core::ffi::c_void, count: usize, @@ -5922,7 +6699,7 @@ extern "C" { @returns #hipSuccess, #hipErrorInvalidValue - @note This API is implemented on Linux, under development on Windows.*/ + @note This API is implemented on Linux and is under development on Microsoft Windows.*/ pub fn hipMemRangeGetAttribute( data: *mut ::core::ffi::c_void, data_size: usize, @@ -5946,7 +6723,7 @@ extern "C" { @returns #hipSuccess, #hipErrorInvalidValue - @note This API is implemented on Linux, under development on Windows.*/ + @returns #hipSuccess, #hipErrorInvalidValue wait no — @note This API is implemented on Linux and is under development on Microsoft Windows.*/ pub fn hipMemRangeGetAttributes( data: *mut *mut ::core::ffi::c_void, data_sizes: *mut usize, @@ -5969,7 +6746,8 @@ extern "C" { @returns #hipSuccess, #hipErrorInvalidValue - @note This API is implemented on Linux, under development on Windows.*/ + @warning This API is under development.
Currently it is a no-operation (NOP) + function on AMD GPUs and returns #hipSuccess.*/ pub fn hipStreamAttachMemAsync( stream: hipStream_t, dev_ptr: *mut ::core::ffi::c_void, @@ -5999,15 +6777,15 @@ extern "C" { @param [in] stream The stream establishing the stream ordering contract and the memory pool to allocate from - @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported, #hipErrorOutOfMemory + @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported, #hipErrorOutOfMemory @see hipMallocFromPoolAsync, hipFreeAsync, hipMemPoolTrimTo, hipMemPoolGetAttribute, hipDeviceSetMemPool, hipMemPoolSetAttribute, hipMemPoolSetAccess, hipMemPoolGetAccess - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues. - @note This API is implemented on Linux, under development on Windows.*/ + @note This API is implemented on Linux and is under development on Microsoft Windows.*/ pub fn hipMallocAsync( dev_ptr: *mut *mut ::core::ffi::c_void, size: usize, @@ -6034,10 +6812,10 @@ extern "C" { @see hipMallocFromPoolAsync, hipMallocAsync, hipMemPoolTrimTo, hipMemPoolGetAttribute, hipDeviceSetMemPool, hipMemPoolSetAttribute, hipMemPoolSetAccess, hipMemPoolGetAccess - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues. - @note This API is implemented on Linux, under development on Windows.*/ + @note This API is implemented on Linux and is under development on Microsoft Windows.*/ pub fn hipFreeAsync( dev_ptr: *mut ::core::ffi::c_void, stream: hipStream_t, @@ -6052,8 +6830,8 @@ extern "C" { The allocator cannot release OS allocations that back outstanding asynchronous allocations. The OS allocations may happen at different granularity from the user allocations. - @note: Allocations that have not been freed count as outstanding. - @note: Allocations that have been asynchronously freed but whose completion has + @note Allocations that have not been freed count as outstanding. + @note Allocations that have been asynchronously freed but whose completion has not been observed on the host (eg. by a synchronize) can count as outstanding. @param[in] mem_pool The memory pool to trim allocations @@ -6066,10 +6844,10 @@ extern "C" { @see hipMallocFromPoolAsync, hipMallocAsync, hipFreeAsync, hipMemPoolGetAttribute, hipDeviceSetMemPool, hipMemPoolSetAttribute, hipMemPoolSetAccess, hipMemPoolGetAccess - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues. 
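A sketch of the stream-ordered round trip these two entry points enable, written against the declarations in this file; illustrative only, with error handling reduced to asserts.

use core::ffi::c_void;

// Illustrative: allocation and free are both ordered by `stream`.
unsafe fn stream_ordered_scratch(stream: hipStream_t) {
    let mut buf: *mut c_void = std::ptr::null_mut();
    assert_eq!(hipMallocAsync(&mut buf, 1 << 16, stream), hipError_t::hipSuccess);
    // ... enqueue kernels or copies that use `buf` on `stream` here ...
    assert_eq!(hipFreeAsync(buf, stream), hipError_t::hipSuccess);
    // Only after the stream drains may the host assume the buffer is gone.
    assert_eq!(hipStreamSynchronize(stream), hipError_t::hipSuccess);
}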
- @note This API is implemented on Linux, under development on Windows.*/ + @note This API is implemented on Linux and is under development on Microsoft Windows.*/ pub fn hipMemPoolTrimTo( mem_pool: hipMemPool_t, min_bytes_to_hold: usize, @@ -6109,10 +6887,10 @@ extern "C" { @see hipMallocFromPoolAsync, hipMallocAsync, hipFreeAsync, hipMemPoolGetAttribute, hipMemPoolTrimTo, hipDeviceSetMemPool, hipMemPoolSetAccess, hipMemPoolGetAccess - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues. - @note This API is implemented on Linux, under development on Windows.*/ + @note This API is implemented on Linux and is under development on Microsoft Windows.*/ pub fn hipMemPoolSetAttribute( mem_pool: hipMemPool_t, attr: hipMemPoolAttr, @@ -6153,10 +6931,10 @@ extern "C" { @see hipMallocFromPoolAsync, hipMallocAsync, hipFreeAsync, hipMemPoolTrimTo, hipDeviceSetMemPool, hipMemPoolSetAttribute, hipMemPoolSetAccess, hipMemPoolGetAccess - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues. - @note This API is implemented on Linux, under development on Windows.*/ + @note This API is implemented on Linux and is under development on Microsoft Windows.*/ pub fn hipMemPoolGetAttribute( mem_pool: hipMemPool_t, attr: hipMemPoolAttr, @@ -6176,10 +6954,10 @@ extern "C" { @see hipMallocFromPoolAsync, hipMallocAsync, hipFreeAsync, hipMemPoolGetAttribute, hipMemPoolTrimTo, hipDeviceSetMemPool, hipMemPoolSetAttribute, hipMemPoolGetAccess - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues. - @note This API is implemented on Linux, under development on Windows.*/ + @note This API is implemented on Linux and is under development on Microsoft Windows.*/ pub fn hipMemPoolSetAccess( mem_pool: hipMemPool_t, desc_list: *const hipMemAccessDesc, @@ -6201,10 +6979,10 @@ extern "C" { @see hipMallocFromPoolAsync, hipMallocAsync, hipFreeAsync, hipMemPoolGetAttribute, hipMemPoolTrimTo, hipDeviceSetMemPool, hipMemPoolSetAttribute, hipMemPoolSetAccess - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues. - @note This API is implemented on Linux, under development on Windows.*/ + @note This API is implemented on Linux and is under development on Microsoft Windows.*/ pub fn hipMemPoolGetAccess( flags: *mut hipMemAccessFlags, mem_pool: hipMemPool_t, @@ -6230,10 +7008,10 @@ extern "C" { @see hipMallocFromPoolAsync, hipMallocAsync, hipFreeAsync, hipMemPoolGetAttribute, hipMemPoolDestroy, hipMemPoolTrimTo, hipDeviceSetMemPool, hipMemPoolSetAttribute, hipMemPoolSetAccess, hipMemPoolGetAccess - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + @warning This API is marked as Beta. 
While this feature is complete, it can + change and might have outstanding issues. - @note This API is implemented on Linux, under development on Windows.*/ + @note This API is implemented on Linux and is under development on Microsoft Windows.*/ pub fn hipMemPoolCreate( mem_pool: *mut hipMemPool_t, pool_props: *const hipMemPoolProps, @@ -6261,10 +7039,10 @@ extern "C" { @see hipMallocFromPoolAsync, hipMallocAsync, hipFreeAsync, hipMemPoolGetAttribute, hipMemPoolCreate hipMemPoolTrimTo, hipDeviceSetMemPool, hipMemPoolSetAttribute, hipMemPoolSetAccess, hipMemPoolGetAccess - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues. - @note This API is implemented on Linux, under development on Windows.*/ + @note This API is implemented on Linux and is under development on Microsoft Windows.*/ pub fn hipMemPoolDestroy(mem_pool: hipMemPool_t) -> hipError_t; } extern "C" { @@ -6296,10 +7074,10 @@ extern "C" { @see hipMallocAsync, hipFreeAsync, hipMemPoolGetAttribute, hipMemPoolCreate hipMemPoolTrimTo, hipDeviceSetMemPool, hipMemPoolSetAttribute, hipMemPoolSetAccess, hipMemPoolGetAccess, - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues. - @note This API is implemented on Linux, under development on Windows.*/ + @note This API is implemented on Linux and is under development on Microsoft Windows.*/ pub fn hipMallocFromPoolAsync( dev_ptr: *mut *mut ::core::ffi::c_void, size: usize, @@ -6317,7 +7095,7 @@ extern "C" { The implementation of what the shareable handle is and how it can be transferred is defined by the requested handle type. - @note: To create an IPC capable mempool, create a mempool with a @p hipMemAllocationHandleType other + @note To create an IPC capable mempool, create a mempool with a @p hipMemAllocationHandleType other than @p hipMemHandleTypeNone. @param [out] shared_handle Pointer to the location in which to store the requested handle @@ -6329,10 +7107,10 @@ extern "C" { @see hipMemPoolImportFromShareableHandle - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues. - @note This API is implemented on Linux, under development on Windows.*/ + @note This API is implemented on Linux and is under development on Microsoft Windows.*/ pub fn hipMemPoolExportToShareableHandle( shared_handle: *mut ::core::ffi::c_void, mem_pool: hipMemPool_t, @@ -6359,10 +7137,10 @@ extern "C" { @see hipMemPoolExportToShareableHandle - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues. 
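A sketch tying the pool entry points together. The hipMemPoolProps field names used here (allocType, location.type_, location.id) follow the usual bindgen layout and are assumptions, as is zero-initializing the remaining fields; illustrative only.

use core::ffi::c_void;

// Illustrative: an explicit pool feeding stream-ordered allocations.
unsafe fn pool_round_trip(stream: hipStream_t) {
    let mut props: hipMemPoolProps = std::mem::zeroed();
    props.allocType = hipMemAllocationType::hipMemAllocationTypePinned;
    props.location.type_ = hipMemLocationType::hipMemLocationTypeDevice;
    props.location.id = 0; // device 0
    let mut pool: hipMemPool_t = std::ptr::null_mut();
    assert_eq!(hipMemPoolCreate(&mut pool, &props), hipError_t::hipSuccess);
    let mut buf: *mut c_void = std::ptr::null_mut();
    assert_eq!(
        hipMallocFromPoolAsync(&mut buf, 4096, pool, stream),
        hipError_t::hipSuccess
    );
    assert_eq!(hipFreeAsync(buf, stream), hipError_t::hipSuccess);
    assert_eq!(hipStreamSynchronize(stream), hipError_t::hipSuccess);
    assert_eq!(hipMemPoolDestroy(pool), hipError_t::hipSuccess);
}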
- @note This API is implemented on Linux, under development on Windows.*/ + @note This API is implemented on Linux and is under development on Microsoft Windows.*/ pub fn hipMemPoolImportFromShareableHandle( mem_pool: *mut hipMemPool_t, shared_handle: *mut ::core::ffi::c_void, @@ -6385,10 +7163,10 @@ extern "C" { @see hipMemPoolImportPointer - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues. - @note This API is implemented on Linux, under development on Windows.*/ + @note This API is implemented on Linux and is under development on Microsoft Windows.*/ pub fn hipMemPoolExportPointer( export_data: *mut hipMemPoolPtrExportData, dev_ptr: *mut ::core::ffi::c_void, @@ -6418,10 +7196,10 @@ extern "C" { @see hipMemPoolExportPointer - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues. - @note This API is implemented on Linux, under development on Windows.*/ + @note This API is implemented on Linux and is under development on Microsoft Windows.*/ pub fn hipMemPoolImportPointer( dev_ptr: *mut *mut ::core::ffi::c_void, mem_pool: hipMemPool_t, @@ -6430,17 +7208,21 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Allocate device accessible page locked host memory [Deprecated] + /** @brief Allocate device accessible page locked host memory @param[out] ptr Pointer to the allocated host pinned memory @param[in] size Requested memory size in bytes - @param[in] flags Type of host memory allocation + @param[in] flags Type of host memory allocation see below If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned. - @return #hipSuccess, #hipErrorOutOfMemory + Flags: + - #hipHostAllocDefault Default pinned memory allocation on the host. + - #hipHostAllocPortable Memory is considered allocated by all contexts. + - #hipHostAllocMapped Map the allocation into the address space for the current device. + - #hipHostAllocWriteCombined Allocates the memory as write-combined. - @warning This API is deprecated, use hipHostMalloc() instead*/ + @return #hipSuccess, #hipErrorOutOfMemory, #hipErrorInvalidValue*/ pub fn hipHostAlloc( ptr: *mut *mut ::core::ffi::c_void, size: usize, @@ -6455,7 +7237,7 @@ extern "C" { @param[in] hstPtr Host Pointer allocated through hipHostMalloc @param[in] flags Flags to be passed for extension - @return #hipSuccess, #hipErrorInvalidValue, #hipErrorOutOfMemory + @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorOutOfMemory @see hipSetDeviceFlags, hipHostMalloc*/ pub fn hipHostGetDevicePointer( @@ -6470,7 +7252,7 @@ extern "C" { @param[out] flagsPtr Memory location to store flags @param[in] hostPtr Host Pointer allocated through hipHostMalloc - @return #hipSuccess, #hipErrorInvalidValue + @returns #hipSuccess, #hipErrorInvalidValue @see hipHostMalloc*/ pub fn hipHostGetFlags( @@ -6496,7 +7278,7 @@ extern "C" { After registering the memory, use #hipHostGetDevicePointer to obtain the mapped device pointer. On many systems, the mapped device pointer will have a different value than the mapped host - pointer. Applications must use the device pointer in device code, and the host pointer in device + pointer. 
Applications must use the device pointer in device code, and the host pointer in host code. On some systems, registered memory is pinned. On some systems, registered memory may not be @@ -6511,7 +7293,7 @@ extern "C" { typically one of the writes will "win" and overwrite data from the other registered memory region. - @return #hipSuccess, #hipErrorOutOfMemory + @returns #hipSuccess, #hipErrorOutOfMemory @see hipHostUnregister, hipHostGetFlags, hipHostGetDevicePointer*/ pub fn hipHostRegister( @@ -6525,7 +7307,7 @@ extern "C" { /** @brief Un-register host pointer @param[in] hostPtr Host pointer previously registered with #hipHostRegister - @return Error code + @returns Error code @see hipHostRegister*/ pub fn hipHostUnregister(hostPtr: *mut ::core::ffi::c_void) -> hipError_t; @@ -6544,7 +7326,7 @@ extern "C" { If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned. - @return Error code + @returns Error code @see hipMalloc, hipFree, hipMallocArray, hipFreeArray, hipHostFree, hipMalloc3D, hipMalloc3DArray, hipHostMalloc*/ @@ -6573,7 +7355,7 @@ extern "C" { Given the row and column of an array element of type T, the address is computed as: T* pElement = (T*)((char*)BaseAddress + Row * Pitch) + Column; - @return Error code + @returns Error code @see hipMalloc, hipFree, hipMallocArray, hipFreeArray, hipHostFree, hipMalloc3D, hipMalloc3DArray, hipHostMalloc*/ @@ -6592,8 +7374,8 @@ extern "C" { If pointer is NULL, the hip runtime is initialized and hipSuccess is returned. @param[in] ptr Pointer to memory to be freed - @return #hipSuccess - @return #hipErrorInvalidDevicePointer (if pointer is invalid, including host pointers allocated + @returns #hipSuccess + @returns #hipErrorInvalidDevicePointer (if pointer is invalid, including host pointers allocated with hipHostMalloc) @see hipMalloc, hipMallocPitch, hipMallocArray, hipFreeArray, hipHostFree, hipMalloc3D, @@ -6602,14 +7384,15 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Free memory allocated by the hcc hip host memory allocation API [Deprecated] + /** @brief Frees page-locked memory + This API performs an implicit hipDeviceSynchronize() call. + If pointer is NULL, the hip runtime is initialized and hipSuccess is returned. @param[in] ptr Pointer to memory to be freed - @return #hipSuccess, + @returns #hipSuccess, #hipErrorInvalidValue (if pointer is invalid, including device pointers allocated with hipMalloc) - - @warning This API is deprecated, use hipHostFree() instead*/ +*/ pub fn hipFreeHost(ptr: *mut ::core::ffi::c_void) -> hipError_t; } extern "C" { @@ -6618,13 +7401,16 @@ extern "C" { This API performs an implicit hipDeviceSynchronize() call. If pointer is NULL, the hip runtime is initialized and hipSuccess is returned. 
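A minimal pinned-memory lifecycle under the semantics just described; illustrative only, assuming flag value 0 selects the default pinned allocation as the hipHostMalloc doc above states.

use core::ffi::c_void;

// Illustrative: allocate, use, and free default pinned host memory.
unsafe fn pinned_scratch() {
    let mut host: *mut c_void = std::ptr::null_mut();
    assert_eq!(hipHostMalloc(&mut host, 4096, 0), hipError_t::hipSuccess);
    // ... use `host` as a staging buffer for device copies ...
    // hipHostFree implicitly synchronizes the device, per the doc above.
    assert_eq!(hipHostFree(host), hipError_t::hipSuccess);
}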
+ @ingroup MemoryD + @param[in] ptr Pointer to memory to be freed - @return #hipSuccess, + @returns #hipSuccess, #hipErrorInvalidValue (if pointer is invalid, including device pointers allocated with hipMalloc) @see hipMalloc, hipMallocPitch, hipFree, hipMallocArray, hipFreeArray, hipMalloc3D, - hipMalloc3DArray, hipHostMalloc*/ + hipMalloc3DArray, hipHostMalloc +*/ pub fn hipHostFree(ptr: *mut ::core::ffi::c_void) -> hipError_t; } extern "C" { @@ -6648,7 +7434,7 @@ extern "C" { @param[in] src Data being copy from @param[in] sizeBytes Data size in bytes @param[in] kind Kind of transfer - @return #hipSuccess, #hipErrorInvalidValue, #hipErrorUnknown + @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorUnknown @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost, hipMemAllocPitch, hipMemcpy2D, hipMemcpy2DAsync, hipMemcpy2DUnaligned, hipMemcpyAtoA, @@ -6673,7 +7459,7 @@ extern "C" { @param[in] sizeBytes Data size in bytes @param[in] kind Kind of transfer @param[in] stream Valid stream - @return #hipSuccess, #hipErrorInvalidValue, #hipErrorUnknown, #hipErrorContextIsDestroyed + @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorUnknown, #hipErrorContextIsDestroyed @see hipMemcpy, hipStreamCreate, hipStreamSynchronize, hipStreamDestroy, hipSetDevice, hipLaunchKernelGGL */ @@ -6693,7 +7479,7 @@ extern "C" { @param[in] src Data being copy from @param[in] sizeBytes Data size in bytes - @return #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext, + @returns #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext, #hipErrorInvalidValue @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost, @@ -6716,7 +7502,7 @@ extern "C" { @param[in] src Data being copy from @param[in] sizeBytes Data size in bytes - @return #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext, + @returns #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext, #hipErrorInvalidValue @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost, @@ -6739,7 +7525,7 @@ extern "C" { @param[in] src Data being copy from @param[in] sizeBytes Data size in bytes - @return #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext, + @returns #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext, #hipErrorInvalidValue @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost, @@ -6763,7 +7549,7 @@ extern "C" { @param[in] srcOffset Offset in bytes of source array @param[in] ByteCount Size of memory copy in bytes - @return #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext, + @returns #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext, #hipErrorInvalidValue @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost, @@ -6788,7 +7574,7 @@ extern "C" { @param[in] srcDevice Source device pointer @param[in] ByteCount Size of memory copy in bytes - @return #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext, + @returns #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext, #hipErrorInvalidValue @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost, @@ -6814,7 +7600,7 @@ extern "C" { @param[in] srcOffset Offset in bytes of source 
array @param[in] ByteCount Size of memory copy in bytes - @return #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext, + @returns #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext, #hipErrorInvalidValue @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost, @@ -6840,7 +7626,7 @@ extern "C" { @param[in] sizeBytes Data size in bytes @param[in] stream Stream identifier - @return #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext, + @returns #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext, #hipErrorInvalidValue @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost, @@ -6865,7 +7651,7 @@ extern "C" { @param[in] sizeBytes Data size in bytes @param[in] stream Stream identifier - @return #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext, + @returns #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext, #hipErrorInvalidValue @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost, @@ -6890,7 +7676,7 @@ extern "C" { @param[in] sizeBytes Data size in bytes @param[in] stream Stream identifier - @return #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext, + @returns #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext, #hipErrorInvalidValue @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost, @@ -6916,7 +7702,7 @@ extern "C" { @param[in] ByteCount Size of memory copy in bytes @param[in] stream Stream identifier - @return #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext, + @returns #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext, #hipErrorInvalidValue @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost, @@ -6943,7 +7729,7 @@ extern "C" { @param[in] ByteCount Size of memory copy in bytes @param[in] stream Stream identifier - @return #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext, + @returns #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext, #hipErrorInvalidValue @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost, @@ -6963,6 +7749,8 @@ extern "C" { extern "C" { #[must_use] /** @brief Returns a global pointer from a module. + @ingroup Module + Returns in *dptr and *bytes the pointer and size of the global of name name located in module hmod. If no variable of that name exists, it returns hipErrorNotFound. Both parameters dptr and bytes are optional. If one of them is NULL, it is ignored and hipSuccess is returned. 
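A sketch of the lookup-then-copy pattern this doc describes. The symbol name my_global is hypothetical, and the hipMemcpyHtoD binding declared elsewhere in this file is assumed; illustrative only.

use core::ffi::c_void;

// Illustrative: look up a module global and overwrite it from the host.
unsafe fn upload_global(module: hipModule_t, bytes: &[u8]) {
    let mut dptr: hipDeviceptr_t = std::ptr::null_mut();
    let mut size: usize = 0;
    assert_eq!(
        hipModuleGetGlobal(&mut dptr, &mut size, module, c"my_global".as_ptr()),
        hipError_t::hipSuccess
    );
    // Refuse to write past the global's actual size.
    assert!(bytes.len() <= size);
    assert_eq!(
        hipMemcpyHtoD(dptr, bytes.as_ptr() as *mut c_void, bytes.len()),
        hipError_t::hipSuccess
    );
}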
@@ -6972,7 +7760,7 @@ extern "C" { @param[in] hmod Module to retrieve global from @param[in] name Name of global to retrieve - @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotFound, #hipErrorInvalidContext + @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotFound, #hipErrorInvalidContext */ pub fn hipModuleGetGlobal( dptr: *mut hipDeviceptr_t, @@ -6988,7 +7776,7 @@ extern "C" { @param[out] devPtr pointer to the device associated the symbole @param[in] symbol pointer to the symbole of the device - @return #hipSuccess, #hipErrorInvalidValue + @returns #hipSuccess, #hipErrorInvalidValue */ pub fn hipGetSymbolAddress( devPtr: *mut *mut ::core::ffi::c_void, @@ -7002,7 +7790,7 @@ extern "C" { @param[in] symbol pointer to the device symbole @param[out] size pointer to the size - @return #hipSuccess, #hipErrorInvalidValue + @returns #hipSuccess, #hipErrorInvalidValue */ pub fn hipGetSymbolSize( size: *mut usize, @@ -7027,7 +7815,7 @@ extern "C" { Returns hipSuccess if the returned pfn is addressed to the pointer of found driver function. - @return #hipSuccess, #hipErrorInvalidValue.*/ + @returns #hipSuccess, #hipErrorInvalidValue.*/ pub fn hipGetProcAddress( symbol: *const ::core::ffi::c_char, pfn: *mut *mut ::core::ffi::c_void, @@ -7054,7 +7842,7 @@ extern "C" { @param[in] offset offset in bytes from start of symbole @param[in] kind type of memory transfer - @return #hipSuccess, #hipErrorInvalidValue + @returns #hipSuccess, #hipErrorInvalidValue */ pub fn hipMemcpyToSymbol( symbol: *const ::core::ffi::c_void, @@ -7075,7 +7863,7 @@ extern "C" { @param[in] kind type of memory transfer @param[in] stream stream identifier - @return #hipSuccess, #hipErrorInvalidValue + @returns #hipSuccess, #hipErrorInvalidValue */ pub fn hipMemcpyToSymbolAsync( symbol: *const ::core::ffi::c_void, @@ -7096,7 +7884,7 @@ extern "C" { @param[in] offset Offset in bytes from the start of symbole @param[in] kind Type of memory transfer - @return #hipSuccess, #hipErrorInvalidValue + @returns #hipSuccess, #hipErrorInvalidValue */ pub fn hipMemcpyFromSymbol( dst: *mut ::core::ffi::c_void, @@ -7117,7 +7905,7 @@ extern "C" { @param[in] kind type of memory transfer @param[in] stream stream identifier - @return #hipSuccess, #hipErrorInvalidValue + @returns #hipSuccess, #hipErrorInvalidValue */ pub fn hipMemcpyFromSymbolAsync( dst: *mut ::core::ffi::c_void, @@ -7150,7 +7938,7 @@ extern "C" { @param[in] sizeBytes Data size in bytes @param[in] kind Type of memory transfer @param[in] stream Stream identifier - @return #hipSuccess, #hipErrorInvalidValue, #hipErrorUnknown + @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorUnknown @see hipMemcpy, hipMemcpy2D, hipMemcpyToArray, hipMemcpy2DToArray, hipMemcpyFromArray, hipMemcpy2DFromArray, hipMemcpyArrayToArray, hipMemcpy2DArrayToArray, hipMemcpyToSymbol, @@ -7173,7 +7961,7 @@ extern "C" { @param[out] dst Data being filled @param[in] value Value to be set @param[in] sizeBytes Data size in bytes - @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized*/ + @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized*/ pub fn hipMemset( dst: *mut ::core::ffi::c_void, value: ::core::ffi::c_int, @@ -7188,7 +7976,7 @@ extern "C" { @param[out] dest Data ptr to be filled @param[in] value Value to be set @param[in] count Number of values to be set - @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized*/ + @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized*/ pub fn hipMemsetD8( dest: hipDeviceptr_t, value: 
::core::ffi::c_uchar, @@ -7209,7 +7997,7 @@ extern "C" { @param[in] value Constant value to be set @param[in] count Number of values to be set @param[in] stream Stream identifier - @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized*/ + @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized*/ pub fn hipMemsetD8Async( dest: hipDeviceptr_t, value: ::core::ffi::c_uchar, @@ -7225,7 +8013,7 @@ extern "C" { @param[out] dest Data ptr to be filled @param[in] value Constant value to be set @param[in] count Number of values to be set - @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized*/ + @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized*/ pub fn hipMemsetD16( dest: hipDeviceptr_t, value: ::core::ffi::c_ushort, @@ -7246,7 +8034,7 @@ extern "C" { @param[in] value Constant value to be set @param[in] count Number of values to be set @param[in] stream Stream identifier - @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized*/ + @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized*/ pub fn hipMemsetD16Async( dest: hipDeviceptr_t, value: ::core::ffi::c_ushort, @@ -7262,7 +8050,7 @@ extern "C" { @param[out] dest Data being filled @param[in] value Constant value to be set @param[in] count Number of values to be set - @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized*/ + @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized*/ pub fn hipMemsetD32( dest: hipDeviceptr_t, value: ::core::ffi::c_int, @@ -7322,7 +8110,7 @@ extern "C" { @param[in] value Constant value to be set @param[in] width @param[in] height - @return #hipSuccess, #hipErrorInvalidValue*/ + @returns #hipSuccess, #hipErrorInvalidValue*/ pub fn hipMemset2D( dst: *mut ::core::ffi::c_void, pitch: usize, @@ -7341,7 +8129,7 @@ extern "C" { @param[in] width Width of matrix set columns in bytes @param[in] height Height of matrix set rows in bytes @param[in] stream Stream identifier - @return #hipSuccess, #hipErrorInvalidValue*/ + @returns #hipSuccess, #hipErrorInvalidValue*/ pub fn hipMemset2DAsync( dst: *mut ::core::ffi::c_void, pitch: usize, @@ -7358,7 +8146,7 @@ extern "C" { @param[in] pitchedDevPtr Pointer to pitched device memory @param[in] value Value to set for each byte of specified memory @param[in] extent Size parameters for width field in bytes in device memory - @return #hipSuccess, #hipErrorInvalidValue*/ + @returns #hipSuccess, #hipErrorInvalidValue*/ pub fn hipMemset3D( pitchedDevPtr: hipPitchedPtr, value: ::core::ffi::c_int, @@ -7373,7 +8161,7 @@ extern "C" { @param[in] value Value to set for each byte of specified memory @param[in] extent Size parameters for width field in bytes in device memory @param[in] stream Stream identifier - @return #hipSuccess, #hipErrorInvalidValue*/ + @returns #hipSuccess, #hipErrorInvalidValue*/ pub fn hipMemset3DAsync( pitchedDevPtr: hipPitchedPtr, value: ::core::ffi::c_int, @@ -7395,7 +8183,7 @@ extern "C" { @param[out] free Returns free memory on the current device in bytes @param[out] total Returns total allocatable memory on the current device in bytes - @return #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue + @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue */ pub fn hipMemGetInfo(free: *mut usize, total: *mut usize) -> hipError_t; } @@ -7408,7 +8196,7 @@ extern "C" { @param[in] ptr Pointer to allocated memory @param[out] size Returns the allocated memory size in bytes - @return #hipSuccess, #hipErrorInvalidValue + @returns #hipSuccess, 
#hipErrorInvalidValue */ pub fn hipMemPtrGetInfo( ptr: *mut ::core::ffi::c_void, @@ -7424,7 +8212,7 @@ extern "C" { @param[in] width Requested array allocation width @param[in] height Requested array allocation height @param[in] flags Requested properties of allocated array - @return #hipSuccess, #hipErrorOutOfMemory + @returns #hipSuccess, #hipErrorOutOfMemory @see hipMalloc, hipMallocPitch, hipFree, hipFreeArray, hipHostMalloc, hipHostFree*/ pub fn hipMallocArray( @@ -7442,7 +8230,7 @@ extern "C" { @param[out] pHandle Pointer to the array memory @param[in] pAllocateArray Requested array desciptor - @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported + @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported @see hipMallocArray, hipArrayDestroy, hipFreeArray*/ pub fn hipArrayCreate( @@ -7456,7 +8244,7 @@ extern "C" { @param[in] array Pointer to the array memory - @return #hipSuccess, #hipErrorInvalidValue + @returns #hipSuccess, #hipErrorInvalidValue @see hipArrayCreate, hipArrayDestroy, hipFreeArray*/ pub fn hipArrayDestroy(array: hipArray_t) -> hipError_t; @@ -7468,7 +8256,7 @@ extern "C" { @param[out] array Pointer to the 3D array memory @param[in] pAllocateArray Requested array desciptor - @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported + @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported @see hipMallocArray, hipArrayDestroy, hipFreeArray*/ pub fn hipArray3DCreate( @@ -7483,7 +8271,7 @@ extern "C" { @param[out] pitchedDevPtr Pointer to the 3D memory @param[in] extent Requested extent - @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported + @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported @see hipMallocPitch, hipMemGetInfo, hipFree*/ pub fn hipMalloc3D( @@ -7496,7 +8284,7 @@ extern "C" { /** @brief Frees an array on the device. 
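// Sketch pairing hipMalloc3D with hipMemset3D from the hunks above: allocate a pitched
// 3-D region and zero it. hipExtent/hipPitchedPtr field names follow the generated
// structs; width is in bytes, per the @param notes. `check` as in the earlier sketch.
unsafe fn alloc_zeroed_3d(
    width_bytes: usize,
    height: usize,
    depth: usize,
) -> Result<hipPitchedPtr, hipError_t> {
    let extent = hipExtent { width: width_bytes, height, depth };
    let mut pitched: hipPitchedPtr = std::mem::zeroed();
    check(hipMalloc3D(&mut pitched, extent))?;
    check(hipMemset3D(pitched, 0, extent))?;
    Ok(pitched)
}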
@param[in] array Pointer to array to free - @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized + @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized @see hipMalloc, hipMallocPitch, hipFree, hipMallocArray, hipHostMalloc, hipHostFree*/ pub fn hipFreeArray(array: hipArray_t) -> hipError_t; @@ -7509,7 +8297,7 @@ extern "C" { @param[in] desc Requested channel format @param[in] extent Requested array allocation width, height and depth @param[in] flags Requested properties of allocated array - @return #hipSuccess, #hipErrorOutOfMemory + @returns #hipSuccess, #hipErrorOutOfMemory @see hipMalloc, hipMallocPitch, hipFree, hipFreeArray, hipHostMalloc, hipHostFree*/ pub fn hipMalloc3DArray( @@ -7528,7 +8316,7 @@ extern "C" { @param[out] flags - Returned array flags @param[in] array - The HIP array to get info for - @return #hipSuccess, #hipErrorInvalidValue #hipErrorInvalidHandle + @returns #hipSuccess, #hipErrorInvalidValue #hipErrorInvalidHandle @see hipArrayGetDescriptor, hipArray3DGetDescriptor*/ pub fn hipArrayGetInfo( @@ -7545,7 +8333,7 @@ extern "C" { @param[out] pArrayDescriptor - Returned array descriptor @param[in] array - Array to get descriptor of - @return #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext, + @returns #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext, #hipErrorInvalidValue #hipErrorInvalidHandle @see hipArray3DCreate, hipArray3DGetDescriptor, hipArrayCreate, hipArrayDestroy, hipMemAlloc, @@ -7567,7 +8355,7 @@ extern "C" { @param[out] pArrayDescriptor - Returned 3D array descriptor @param[in] array - 3D array to get descriptor of - @return #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext, + @returns #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext, #hipErrorInvalidValue #hipErrorInvalidHandle, #hipErrorContextIsDestroyed @see hipArray3DCreate, hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, @@ -7593,7 +8381,7 @@ extern "C" { @param[in] width Width of matrix transfer (columns in bytes) @param[in] height Height of matrix transfer (rows) @param[in] kind Type of transfer - @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, + @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection @see hipMemcpy, hipMemcpyToArray, hipMemcpy2DToArray, hipMemcpyFromArray, hipMemcpyToSymbol, @@ -7612,7 +8400,7 @@ extern "C" { #[must_use] /** @brief Copies memory for 2D arrays. @param[in] pCopy Parameters for the memory copy - @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, + @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection @see hipMemcpy, hipMemcpy2D, hipMemcpyToArray, hipMemcpy2DToArray, hipMemcpyFromArray, @@ -7624,7 +8412,7 @@ extern "C" { /** @brief Copies memory for 2D arrays. 
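// Sketch of the hipMemcpy2D declaration documented above: upload `rows` rows of
// `width_bytes` each from a densely packed host slice into a pitched device allocation.
// The hipMemcpyKind variant name is as bindgen emits it; `check` as in the earlier sketch.
unsafe fn upload_2d(
    dst: *mut ::core::ffi::c_void,
    dpitch: usize,
    src: &[u8],
    width_bytes: usize,
    rows: usize,
) -> Result<(), hipError_t> {
    // The source pitch equals the row width because the host buffer is dense.
    check(hipMemcpy2D(dst, dpitch, src.as_ptr().cast(), width_bytes,
        width_bytes, rows, hipMemcpyKind::hipMemcpyHostToDevice))
}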
@param[in] pCopy Parameters for the memory copy @param[in] stream Stream to use - @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, + @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection @see hipMemcpy, hipMemcpy2D, hipMemcpyToArray, hipMemcpy2DToArray, hipMemcpyFromArray, @@ -7646,7 +8434,7 @@ extern "C" { @param[in] height Height of matrix transfer (rows) @param[in] kind Type of transfer @param[in] stream Stream to use - @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, + @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection @see hipMemcpy, hipMemcpyToArray, hipMemcpy2DToArray, hipMemcpyFromArray, hipMemcpyToSymbol, @@ -7674,7 +8462,7 @@ extern "C" { @param[in] width Width of matrix transfer (columns in bytes) @param[in] height Height of matrix transfer (rows) @param[in] kind Type of transfer - @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, + @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection @see hipMemcpy, hipMemcpyToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol, @@ -7703,7 +8491,7 @@ extern "C" { @param[in] height Height of matrix transfer (rows) @param[in] kind Type of transfer @param[in] stream Accelerator view which the copy is being enqueued - @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, + @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection @see hipMemcpy, hipMemcpyToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol, @@ -7734,7 +8522,7 @@ extern "C" { @param[in] height Height of matrix transfer (rows) @param[in] kind Type of transfer - @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidMemcpyDirection + @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidMemcpyDirection @see hipMemcpy, hipMemcpyToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol, hipMemcpyAsync*/ @@ -7752,7 +8540,9 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Copies data between host and device. + /** @brief Copies data between host and device [Deprecated] + + @ingroup MemoryD @param[in] dst Destination memory address @param[in] wOffset Destination starting X offset @@ -7760,7 +8550,7 @@ extern "C" { @param[in] src Source memory address @param[in] count size in bytes to copy @param[in] kind Type of transfer - @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, + @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol, @@ -7777,7 +8567,9 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Copies data between host and device. 
+ /** @brief Copies data between host and device [Deprecated] + + @ingroup MemoryD @param[in] dst Destination memory address @param[in] srcArray Source memory address @@ -7785,7 +8577,7 @@ extern "C" { @param[in] hOffset Source starting Y offset @param[in] count Size in bytes to copy @param[in] kind Type of transfer - @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, + @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol, @@ -7812,7 +8604,7 @@ extern "C" { @param[in] width Width of matrix transfer (columns in bytes) @param[in] height Height of matrix transfer (rows) @param[in] kind Type of transfer - @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, + @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol, @@ -7841,7 +8633,7 @@ extern "C" { @param[in] height Height of matrix transfer (rows) @param[in] kind Type of transfer @param[in] stream Accelerator view which the copy is being enqueued - @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, + @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol, @@ -7866,7 +8658,7 @@ extern "C" { @param[in] srcArray Source array @param[in] srcOffset Offset in bytes of source array @param[in] count Size of memory copy in bytes - @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, + @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol, @@ -7886,7 +8678,7 @@ extern "C" { @param[in] dstOffset Offset in bytes of destination array @param[in] srcHost Source host pointer @param[in] count Size of memory copy in bytes - @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, + @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol, @@ -7903,7 +8695,7 @@ extern "C" { /** @brief Copies data between host and device. @param[in] p 3D memory copy parameters - @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, + @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol, @@ -7916,7 +8708,7 @@ extern "C" { @param[in] p 3D memory copy parameters @param[in] stream Stream to use - @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, + @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol, @@ -7931,7 +8723,7 @@ extern "C" { /** @brief Copies data between host and device. 
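// Sketch for the hipMemcpy3D declaration above, using the usual C idiom of
// zero-initializing hipMemcpy3DParms and filling only the fields a linear-memory copy
// needs. Field names follow the generated struct and are an assumption here;
// `check` as in the earlier sketch.
unsafe fn copy_3d(
    dst: hipPitchedPtr,
    src: hipPitchedPtr,
    extent: hipExtent,
) -> Result<(), hipError_t> {
    let mut p: hipMemcpy3DParms = std::mem::zeroed();
    p.srcPtr = src;
    p.dstPtr = dst;
    p.extent = extent;
    p.kind = hipMemcpyKind::hipMemcpyDeviceToDevice;
    check(hipMemcpy3D(&p))
}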
@param[in] pCopy 3D memory copy parameters - @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, + @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol, @@ -7944,7 +8736,7 @@ extern "C" { @param[in] pCopy 3D memory copy parameters @param[in] stream Stream to use - @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, + @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol, @@ -8063,7 +8855,7 @@ extern "C" { @param [in] flags Context creation flags @param [in] device device handle - @return #hipSuccess + @returns #hipSuccess @see hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice @@ -8079,7 +8871,7 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Destroy a HIP context. + /** @brief Destroy a HIP context [Deprecated] @param [in] ctx Context to destroy @@ -8094,7 +8886,7 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Pop the current/default context and return the popped context. + /** @brief Pop the current/default context and return the popped context [Deprecated] @param [out] ctx The current context to pop @@ -8109,7 +8901,7 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Push the context to be set as current/ default context + /** @brief Push the context to be set as current/ default context [Deprecated] @param [in] ctx The current context to push @@ -8124,7 +8916,7 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Set the passed context as current/default + /** @brief Set the passed context as current/default [Deprecated] @param [in] ctx The context to set as current @@ -8139,7 +8931,7 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Get the handle of the current/ default context + /** @brief Get the handle of the current/ default context [Deprecated] @param [out] ctx The context to get as current @@ -8154,7 +8946,7 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Get the handle of the device associated with current/default context + /** @brief Get the handle of the device associated with current/default context [Deprecated] @param [out] device The device from the current context @@ -8171,10 +8963,10 @@ extern "C" { #[must_use] /** @brief Returns the approximate HIP api version. - @param [in] ctx Context to check + @param [in] ctx Context to check [Deprecated] @param [out] apiVersion API version to get - @return #hipSuccess + @returns #hipSuccess @warning The HIP feature set does not correspond to an exact CUDA SDK api revision. This function always set *apiVersion to 4 as an approximation though HIP supports @@ -8194,11 +8986,11 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Get Cache configuration for a specific function + /** @brief Get Cache configuration for a specific function [Deprecated] @param [out] cacheConfig Cache configuration - @return #hipSuccess + @returns #hipSuccess @warning AMD devices and some Nvidia GPUS do not support reconfigurable cache. This hint is ignored on those architectures. @@ -8212,7 +9004,7 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Set L1/Shared cache partition. 
+ /** @brief Set L1/Shared cache partition [Deprecated] @param [in] cacheConfig Cache configuration to set @@ -8230,7 +9022,7 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Set Shared memory bank configuration. + /** @brief Set Shared memory bank configuration [Deprecated] @param [in] config Shared memory configuration to set @@ -8248,7 +9040,7 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Get Shared memory bank configuration. + /** @brief Get Shared memory bank configuration [Deprecated] @param [out] pConfig Pointer of shared memory configuration @@ -8266,7 +9058,7 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Blocks until the default context has completed all preceding requested tasks. + /** @brief Blocks until the default context has completed all preceding requested tasks [Deprecated] @return #hipSuccess @@ -8282,7 +9074,7 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Return flags used for creating default context. + /** @brief Return flags used for creating default context [Deprecated] @param [out] flags Pointer of flags @@ -8297,7 +9089,7 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Enables direct access to memory allocations in a peer context. + /** @brief Enables direct access to memory allocations in a peer context [Deprecated] Memory which already allocated on peer device will be mapped into the address space of the current device. In addition, all future memory allocations on peerDeviceId will be mapped into @@ -8326,7 +9118,7 @@ extern "C" { #[must_use] /** @brief Disable direct access from current context's virtual address space to memory allocations physically located on a peer context.Disables direct access to memory allocations in a peer - context and unregisters any registered allocations. + context and unregisters any registered allocations [Deprecated] Returns #hipErrorPeerAccessNotEnabled if direct access to memory on peerDevice has not yet been enabled from the current device. @@ -8345,7 +9137,7 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Get the state of the primary context. + /** @brief Get the state of the primary context [Deprecated] @param [in] dev Device to get primary context flags for @param [out] flags Pointer to store flags @@ -8368,7 +9160,7 @@ extern "C" { #[must_use] /** @brief Release the primary context on the GPU. - @param [in] dev Device which primary context is released + @param [in] dev Device which primary context is released [Deprecated] @returns #hipSuccess @@ -8383,7 +9175,7 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Retain the primary context on the GPU. + /** @brief Retain the primary context on the GPU [Deprecated] @param [out] pctx Returned context handle of the new context @param [in] dev Device which primary context is released @@ -8402,7 +9194,7 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Resets the primary context on the GPU. + /** @brief Resets the primary context on the GPU [Deprecated] @param [in] dev Device which primary context is reset @@ -8417,7 +9209,7 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Set flags for the primary context. + /** @brief Set flags for the primary context [Deprecated] @param [in] dev Device for which the primary context flags are set @param [in] flags New flags for the device @@ -8472,7 +9264,7 @@ extern "C" { extern "C" { #[must_use] /** @brief Find out attributes for a given function. 
- + @ingroup Execution @param [out] attr Attributes of funtion @param [in] func Pointer to the function handle @@ -8485,7 +9277,7 @@ extern "C" { #[must_use] /** @brief Find out a specific attribute for a given function. - + @ingroup Execution @param [out] value Pointer to the value @param [in] attrib Attributes of the given funtion @param [in] hfunc Function to get attributes from @@ -8560,11 +9352,112 @@ extern "C" { optionValues: *mut *mut ::core::ffi::c_void, ) -> hipError_t; } +extern "C" { + #[must_use] + /** @brief Adds input data to be linked with options. + @param [in] state hip link state + @param [in] type Type of the input data or bitcode + @param [in] data Input data which is null terminated + @param [in] size Size of the input data + @param [in] name Optional name for this input + @param [in] numOptions Size of the options + @param [in] options Array of options applied to this input + @param [in] optionValues Array of option values cast to void* + + @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidHandle + + If adding the data fails, it will + @return #hipErrorInvalidConfiguration + + @see hipError_t*/ + pub fn hipLinkAddData( + state: hipLinkState_t, + type_: hipJitInputType, + data: *mut ::core::ffi::c_void, + size: usize, + name: *const ::core::ffi::c_char, + numOptions: ::core::ffi::c_uint, + options: *mut hipJitOption, + optionValues: *mut *mut ::core::ffi::c_void, + ) -> hipError_t; +} +extern "C" { + #[must_use] + /** @brief Adds a file with bitcode to be linked with options + @param [in] state hip link state + @param [in] type Type of the input data or bitcode + @param [in] path Path to the input file where bitcode is present + @param [in] numOptions Size of the options + @param [in] options Array of options applied to this input + @param [in] optionValues Array of option values cast to void* + + @returns #hipSuccess, #hipErrorInvalidValue + + If adding the file fails, it will + @return #hipErrorInvalidConfiguration + + @see hipError_t*/ + pub fn hipLinkAddFile( + state: hipLinkState_t, + type_: hipJitInputType, + path: *const ::core::ffi::c_char, + numOptions: ::core::ffi::c_uint, + options: *mut hipJitOption, + optionValues: *mut *mut ::core::ffi::c_void, + ) -> hipError_t; +} +extern "C" { + #[must_use] + /** @brief Completes the linking of the given program. + @param [in] state hip link state + @param [out] hipBinOut Upon success, points to the output binary + @param [out] sizeOut Size of the binary is stored (optional) + + @returns #hipSuccess #hipErrorInvalidValue + + If the linking fails, it will + @return #hipErrorInvalidConfiguration + + @see hipError_t*/ + pub fn hipLinkComplete( + state: hipLinkState_t, + hipBinOut: *mut *mut ::core::ffi::c_void, + sizeOut: *mut usize, + ) -> hipError_t; +} +extern "C" { + #[must_use] + /** @brief Creates the link instance via hip APIs. + @param [in] numOptions Number of options + @param [in] options Array of options + @param [in] optionValues Array of option values cast to void* + @param [out] stateOut hip link state created upon success + + @returns #hipSuccess #hipErrorInvalidValue #hipErrorInvalidConfiguration + + @see hipSuccess*/ + pub fn hipLinkCreate( + numOptions: ::core::ffi::c_uint, + options: *mut hipJitOption, + optionValues: *mut *mut ::core::ffi::c_void, + stateOut: *mut hipLinkState_t, + ) -> hipError_t; +} +extern "C" { + #[must_use] + /** @brief Deletes the link instance via hip APIs.
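// End-to-end sketch of the linker entry points above: create a link state, add one
// null-terminated bitcode buffer, complete the link, copy the result out (the output
// buffer is owned by the link state), then destroy the state. The hipJitInputType
// variant name is illustrative; `check` as in the earlier sketch.
unsafe fn link_one(bitcode: &mut [u8]) -> Result<Vec<u8>, hipError_t> {
    let mut state: hipLinkState_t = std::mem::zeroed();
    check(hipLinkCreate(0, std::ptr::null_mut(), std::ptr::null_mut(), &mut state))?;
    check(hipLinkAddData(
        state,
        hipJitInputType::hipJitInputLLVMBitcode, // illustrative input type
        bitcode.as_mut_ptr().cast(),
        bitcode.len(),
        std::ptr::null(), // optional name
        0,
        std::ptr::null_mut(),
        std::ptr::null_mut(),
    ))?;
    let mut bin: *mut ::core::ffi::c_void = std::ptr::null_mut();
    let mut size = 0usize;
    check(hipLinkComplete(state, &mut bin, &mut size))?;
    let out = std::slice::from_raw_parts(bin as *const u8, size).to_vec();
    check(hipLinkDestroy(state))?;
    Ok(out)
}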
+ @param [in] state link state instance + + @returns #hipSuccess #hipErrorInvalidValue + + @see hipSuccess*/ + pub fn hipLinkDestroy(state: hipLinkState_t) -> hipError_t; +} extern "C" { #[must_use] /** @brief launches kernel f with launch parameters and shared memory on stream with arguments passed to kernelparams or extra - + @ingroup Execution @param [in] f Kernel to launch. @param [in] gridDimX X grid dimension specified as multiple of blockDimX. @param [in] gridDimY Y grid dimension specified as multiple of blockDimY. @@ -8604,29 +9497,7 @@ extern "C" { } extern "C" { #[must_use] - /** @brief launches kernel f with launch parameters and shared memory on stream with arguments passed - to kernelParams, where thread blocks can cooperate and synchronize as they execute - - @param [in] f Kernel to launch. - @param [in] gridDimX X grid dimension specified as multiple of blockDimX. - @param [in] gridDimY Y grid dimension specified as multiple of blockDimY. - @param [in] gridDimZ Z grid dimension specified as multiple of blockDimZ. - @param [in] blockDimX X block dimension specified in work-items. - @param [in] blockDimY Y block dimension specified in work-items. - @param [in] blockDimZ Z block dimension specified in work-items. - @param [in] sharedMemBytes Amount of dynamic shared memory to allocate for this kernel. The - HIP-Clang compiler provides support for extern shared declarations. - @param [in] stream Stream where the kernel should be dispatched. May be 0, - in which case the default stream is used with associated synchronization rules. - @param [in] kernelParams A list of kernel arguments. - - Please note, HIP does not support kernel launch with total work items defined in dimension with - size gridDim x blockDim >= 2^32. - - @returns #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext, - #hipErrorInvalidHandle, #hipErrorInvalidImage, #hipErrorInvalidValue, - #hipErrorInvalidConfiguration, #hipErrorLaunchFailure, #hipErrorLaunchOutOfResources, - #hipErrorLaunchTimeOut, #hipErrorCooperativeLaunchTooLarge, #hipErrorSharedObjectInitFailed*/ + #[doc = " \\addtogroup ModuleCooperativeG Cooperative groups kernel launch of Module management.\n \\ingroup Module\n @{ */\n/**\n @brief launches kernel f with launch parameters and shared memory on stream with arguments passed\n to kernelParams, where thread blocks can cooperate and synchronize as they execute\n\n @param [in] f Kernel to launch.\n @param [in] gridDimX X grid dimension specified as multiple of blockDimX.\n @param [in] gridDimY Y grid dimension specified as multiple of blockDimY.\n @param [in] gridDimZ Z grid dimension specified as multiple of blockDimZ.\n @param [in] blockDimX X block dimension specified in work-items.\n @param [in] blockDimY Y block dimension specified in work-items.\n @param [in] blockDimZ Z block dimension specified in work-items.\n @param [in] sharedMemBytes Amount of dynamic shared memory to allocate for this kernel. The\n HIP-Clang compiler provides support for extern shared declarations.\n @param [in] stream Stream where the kernel should be dispatched. 
May be 0,\n in which case the default stream is used with associated synchronization rules.\n @param [in] kernelParams A list of kernel arguments.\n\n Please note, HIP does not support kernel launch with total work items defined in dimension with\n size \\f$ gridDim \\cdot blockDim \\geq 2^{32} \\f$.\n\n @returns #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext,\n #hipErrorInvalidHandle, #hipErrorInvalidImage, #hipErrorInvalidValue,\n #hipErrorInvalidConfiguration, #hipErrorLaunchFailure, #hipErrorLaunchOutOfResources,\n #hipErrorLaunchTimeOut, #hipErrorCooperativeLaunchTooLarge, #hipErrorSharedObjectInitFailed"] pub fn hipModuleLaunchCooperativeKernel( f: hipFunction_t, gridDimX: ::core::ffi::c_uint, @@ -8662,22 +9533,24 @@ extern "C" { } extern "C" { #[must_use] - /** @brief launches kernel f with launch parameters and shared memory on stream with arguments passed - to kernelparams or extra, where thread blocks can cooperate and synchronize as they execute + /** @brief Launches kernel f with launch parameters and shared memory on stream with arguments passed + to kernelparams or extra, where thread blocks can cooperate and synchronize as they execute. - @param [in] f Kernel to launch. - @param [in] gridDim Grid dimensions specified as multiple of blockDim. - @param [in] blockDimX Block dimensions specified in work-items - @param [in] kernelParams A list of kernel arguments - @param [in] sharedMemBytes Amount of dynamic shared memory to allocate for this kernel. The + @param [in] f - Kernel to launch. + @param [in] gridDim - Grid dimensions specified as multiple of blockDim. + @param [in] blockDimX - Block dimensions specified in work-items + @param [in] kernelParams - Pointer of arguments passed to the kernel. If the kernel has multiple + parameters, 'kernelParams' should be an array of pointers, each pointing to the corresponding argument. + @param [in] sharedMemBytes - Amount of dynamic shared memory to allocate for this kernel. The HIP-Clang compiler provides support for extern shared declarations. - @param [in] stream Stream where the kernel should be dispatched. May be 0, in which case th + @param [in] stream - Stream where the kernel should be dispatched. May be 0, in which case the default stream is used with associated synchronization rules. Please note, HIP does not support kernel launch with total work items defined in dimension with - size gridDim x blockDim >= 2^32. + size \f$ gridDim \cdot blockDim \geq 2^{32} \f$. - @returns #hipSuccess, #hipErrorNotInitialized, #hipErrorInvalidValue, #hipErrorCooperativeLaunchTooLarge*/ + @returns #hipSuccess, #hipErrorNotInitialized, #hipErrorInvalidValue, + #hipErrorCooperativeLaunchTooLarge*/ pub fn hipLaunchCooperativeKernel( f: *const ::core::ffi::c_void, gridDim: dim3, @@ -8708,8 +9581,7 @@ extern "C" { #[must_use] /** @brief Launches kernels on multiple devices and guarantees all specified kernels are dispatched on respective streams before enqueuing any other work on the specified streams from any other threads - - + @ingroup Execution @param [in] launchParamsList List of launch parameters, one per device. @param [in] numDevices Size of the launchParamsList array. @param [in] flags Flags to control launch behavior. @@ -8848,15 +9720,15 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Start recording of profiling information + /** @brief Start recording of profiling information [Deprecated] When using this API, start the profiler with profiling disabled.
(--startdisabled) @returns #hipErrorNotSupported - @warning : hipProfilerStart API is deprecated, use roctracer/rocTX instead.*/ + @warning hipProfilerStart API is deprecated, use roctracer/rocTX instead.*/ pub fn hipProfilerStart() -> hipError_t; } extern "C" { #[must_use] - /** @brief Stop recording of profiling information. + /** @brief Stop recording of profiling information [Deprecated] When using this API, start the profiler with profiling disabled. (--startdisabled) @returns #hipErrorNotSupported @warning hipProfilerStart API is deprecated, use roctracer/rocTX instead.*/ @@ -8902,10 +9774,11 @@ extern "C" { #[must_use] /** @brief C compliant kernel launch API - @param [in] function_address - kernel stub function pointer. - @param [in] numBlocks - number of blocks - @param [in] dimBlocks - dimension of a block - @param [in] args - kernel arguments + @param [in] function_address - Kernel stub function pointer. + @param [in] numBlocks - Number of blocks. + @param [in] dimBlocks - Dimension of a block. + @param [in] args - Pointer of arguments passed to the kernel. If the kernel has multiple + parameters, 'args' should be an array of pointers, each pointing to the corresponding argument. @param [in] sharedMemBytes - Amount of dynamic shared memory to allocate for this kernel. The HIP-Clang compiler provides support for extern shared declarations. @param [in] stream - Stream where the kernel should be dispatched. May be 0, in which case th @@ -8966,19 +9839,20 @@ extern "C" { #[must_use] /** @brief Launches kernel from the pointer address, with arguments and shared memory on stream. - @param [in] function_address pointer to the Kernel to launch. - @param [in] numBlocks number of blocks. - @param [in] dimBlocks dimension of a block. - @param [in] args pointer to kernel arguments. - @param [in] sharedMemBytes Amount of dynamic shared memory to allocate for this kernel. + @param [in] function_address - Pointer to the Kernel to launch. + @param [in] numBlocks - Number of blocks. + @param [in] dimBlocks - Dimension of a block. + @param [in] args - Pointer of arguments passed to the kernel. If the kernel has multiple + parameters, 'args' should be an array of pointers, each pointing to the corresponding argument. + @param [in] sharedMemBytes - Amount of dynamic shared memory to allocate for this kernel. HIP-Clang compiler provides support for extern shared declarations. - @param [in] stream Stream where the kernel should be dispatched. + @param [in] stream - Stream where the kernel should be dispatched. May be 0, in which case the default stream is used with associated synchronization rules. - @param [in] startEvent If non-null, specified event will be updated to track the start time of + @param [in] startEvent - If non-null, specified event will be updated to track the start time of the kernel launch. The event must be created before calling this API. - @param [in] stopEvent If non-null, specified event will be updated to track the stop time of + @param [in] stopEvent - If non-null, specified event will be updated to track the stop time of the kernel launch. The event must be created before calling this API. - @param [in] flags The value of hipExtAnyOrderLaunch, signifies if kernel can be + @param [in] flags - The value of hipExtAnyOrderLaunch, signifies if kernel can be launched in any order. @returns #hipSuccess, #hipErrorNotInitialized, #hipErrorInvalidValue.
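// Sketch of the kernelParams/args convention the hunks above describe: one pointer per
// kernel parameter, each pointing at the argument value. Launch geometry here is
// arbitrary; `check` as in the earlier sketch.
unsafe fn launch_with_two_args(
    f: hipFunction_t,
    mut buf: hipDeviceptr_t,
    mut n: u32,
    stream: hipStream_t,
) -> Result<(), hipError_t> {
    let mut params = [
        &mut buf as *mut hipDeviceptr_t as *mut ::core::ffi::c_void,
        &mut n as *mut u32 as *mut ::core::ffi::c_void,
    ];
    check(hipModuleLaunchKernel(
        f,
        n.div_ceil(256).max(1), 1, 1, // grid
        256, 1, 1,                    // block
        0,                            // dynamic shared memory
        stream,
        params.as_mut_ptr(),
        std::ptr::null_mut(),         // extra
    ))
}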
*/ @@ -9163,7 +10037,7 @@ extern "C" { @return #hipSuccess, #hipErrorInvalidValue, #hipErrorMemoryAllocation - @note This API is implemented on Windows, under development on Linux. + @note This API is implemented on Linux and is under development on Microsoft Windows. */ pub fn hipMallocMipmappedArray( mipmappedArray: *mut hipMipmappedArray_t, @@ -9181,7 +10055,7 @@ extern "C" { @return #hipSuccess, #hipErrorInvalidValue - @note This API is implemented on Windows, under development on Linux. + @note This API is implemented on Linux and is under development on Microsoft Windows. */ pub fn hipFreeMipmappedArray(mipmappedArray: hipMipmappedArray_t) -> hipError_t; } @@ -9195,7 +10069,7 @@ extern "C" { @return #hipSuccess, #hipErrorInvalidValue - @note This API is implemented on Windows, under development on Linux. + @note This API is implemented on Linux and is under development on Microsoft Windows. */ pub fn hipGetMipmappedArrayLevel( levelArray: *mut hipArray_t, @@ -9213,7 +10087,7 @@ extern "C" { @returns #hipSuccess, #hipErrorNotSupported, #hipErrorInvalidValue - @note This API is implemented on Windows, under development on Linux.*/ + @note This API is implemented on Linux and is under development on Microsoft Windows.*/ pub fn hipMipmappedArrayCreate( pHandle: *mut hipMipmappedArray_t, pMipmappedArrayDesc: *mut HIP_ARRAY3D_DESCRIPTOR, @@ -9228,7 +10102,7 @@ extern "C" { @returns #hipSuccess, #hipErrorInvalidValue - @note This API is implemented on Windows, under development on Linux. + @note This API is implemented on Linux and is under development on Microsoft Windows. */ pub fn hipMipmappedArrayDestroy(hMipmappedArray: hipMipmappedArray_t) -> hipError_t; } @@ -9242,7 +10116,7 @@ extern "C" { @returns #hipSuccess, #hipErrorInvalidValue - @note This API is implemented on Windows, under development on Linux. + @note This API is implemented on Linux and is under development on Microsoft Windows. */ pub fn hipMipmappedArrayGetLevel( pLevelArray: *mut hipArray_t, @@ -9252,7 +10126,7 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Binds a mipmapped array to a texture. + /** @brief Binds a mipmapped array to a texture [Deprecated] @param [in] tex pointer to the texture reference to bind @param [in] mipmappedArray memory mipmapped array on the device @@ -9268,7 +10142,7 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Gets the texture reference related with the symbol. + /** @brief Gets the texture reference related with the symbol [Deprecated] @param [out] texref texture reference @param [in] symbol pointer to the symbol related with the texture for the reference @@ -9283,7 +10157,7 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Gets the border color used by a texture reference. + /** @brief Gets the border color used by a texture reference [Deprecated] @param [out] pBorderColor Returned Type and Value of RGBA color. @param [in] texRef Texture reference. @@ -9298,7 +10172,7 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Gets the array bound to a texture reference. + /** @brief Gets the array bound to a texture reference [Deprecated] @param [in] pArray Returned array. @@ -9314,7 +10188,7 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Sets address mode for a texture reference. + /** @brief Sets address mode for a texture reference [Deprecated] @param [in] texRef texture reference. @param [in] dim Dimension of the texture. @@ -9331,7 +10205,7 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Binds an array as a texture reference. 
+ /** @brief Binds an array as a texture reference [Deprecated] @param [in] tex Pointer texture reference. @param [in] array Array to bind. @@ -9349,7 +10223,7 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Set filter mode for a texture reference. + /** @brief Set filter mode for a texture reference [Deprecated] @param [in] texRef Pointer texture reference. @param [in] fm Value of texture filter mode. @@ -9365,7 +10239,7 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Set flags for a texture reference. + /** @brief Set flags for a texture reference [Deprecated] @param [in] texRef Pointer texture reference. @param [in] Flags Value of flags. @@ -9381,7 +10255,7 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Set format for a texture reference. + /** @brief Set format for a texture reference [Deprecated] @param [in] texRef Pointer texture reference. @param [in] fmt Value of format. @@ -9399,7 +10273,7 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Binds a memory area to a texture. + /** @brief Binds a memory area to a texture [Deprecated] @param [in] offset Offset in bytes. @param [in] tex Texture to bind. @@ -9421,7 +10295,7 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Binds a 2D memory area to a texture. + /** @brief Binds a 2D memory area to a texture [Deprecated] @param [in] offset Offset in bytes. @param [in] tex Texture to bind. @@ -9447,7 +10321,7 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Binds a memory area to a texture. + /** @brief Binds a memory area to a texture [Deprecated] @param [in] tex Pointer of texture reference. @param [in] array Array to bind. @@ -9465,7 +10339,7 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Get the offset of the alignment in a texture. + /** @brief Get the offset of the alignment in a texture [Deprecated] @param [in] offset Offset in bytes. @param [in] texref Pointer of texture reference. @@ -9481,7 +10355,7 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Unbinds a texture. + /** @brief Unbinds a texture [Deprecated] @param [in] tex Texture to unbind. @@ -9493,7 +10367,7 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Gets the address for a texture reference. + /** @brief Gets the address for a texture reference [Deprecated] @param [out] dev_ptr Pointer of device address. @param [in] texRef Pointer of texture reference. @@ -9509,7 +10383,7 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Gets the address mode for a texture reference. + /** @brief Gets the address mode for a texture reference [Deprecated] @param [out] pam Pointer of address mode. @param [in] texRef Pointer of texture reference. @@ -9527,7 +10401,7 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Gets filter mode for a texture reference. + /** @brief Gets filter mode for a texture reference [Deprecated] @param [out] pfm Pointer of filter mode. @param [in] texRef Pointer of texture reference. @@ -9543,7 +10417,7 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Gets flags for a texture reference. + /** @brief Gets flags for a texture reference [Deprecated] @param [out] pFlags Pointer of flags. @param [in] texRef Pointer of texture reference. @@ -9559,7 +10433,7 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Gets texture format for a texture reference. + /** @brief Gets texture format for a texture reference [Deprecated] @param [out] pFormat Pointer of the format. @param [out] pNumChannels Pointer of number of channels. 
@@ -9577,7 +10451,7 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Gets the maximum anisotropy for a texture reference. + /** @brief Gets the maximum anisotropy for a texture reference [Deprecated] @param [out] pmaxAnsio Pointer of the maximum anisotropy. @param [in] texRef Pointer of texture reference. @@ -9593,7 +10467,7 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Gets the mipmap filter mode for a texture reference. + /** @brief Gets the mipmap filter mode for a texture reference [Deprecated] @param [out] pfm Pointer of the mipmap filter mode. @param [in] texRef Pointer of texture reference. @@ -9609,7 +10483,7 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Gets the mipmap level bias for a texture reference. + /** @brief Gets the mipmap level bias for a texture reference [Deprecated] @param [out] pbias Pointer of the mipmap level bias. @param [in] texRef Pointer of texture reference. @@ -9625,7 +10499,7 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Gets the minimum and maximum mipmap level clamps for a texture reference. + /** @brief Gets the minimum and maximum mipmap level clamps for a texture reference [Deprecated] @param [out] pminMipmapLevelClamp Pointer of the minimum mipmap level clamp. @param [out] pmaxMipmapLevelClamp Pointer of the maximum mipmap level clamp. @@ -9643,7 +10517,7 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Gets the mipmapped array bound to a texture reference. + /** @brief Gets the mipmapped array bound to a texture reference [Deprecated] @param [out] pArray Pointer of the mipmapped array. @param [in] texRef Pointer of texture reference. @@ -9659,7 +10533,7 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Sets an bound address for a texture reference. + /** @brief Sets a bound address for a texture reference [Deprecated] @param [out] ByteOffset Pointer of the offset in bytes. @param [in] texRef Pointer of texture reference. @@ -9679,7 +10553,7 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Set a bind an address as a 2D texture reference. + /** @brief Binds an address as a 2D texture reference [Deprecated] @param [in] texRef Pointer of texture reference. @param [in] desc Pointer of array descriptor. @@ -9699,7 +10573,7 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Sets the maximum anisotropy for a texture reference. + /** @brief Sets the maximum anisotropy for a texture reference [Deprecated] @param [in] texRef Pointer of texture reference. @param [out] maxAniso Value of the maximum anisotropy. @@ -9715,7 +10589,7 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Sets border color for a texture reference. + /** @brief Sets border color for a texture reference [Deprecated] @param [in] texRef Pointer of texture reference. @param [in] pBorderColor Pointer of border color. @@ -9731,7 +10605,7 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Sets mipmap filter mode for a texture reference. + /** @brief Sets mipmap filter mode for a texture reference [Deprecated] @param [in] texRef Pointer of texture reference. @param [in] fm Value of filter mode. @@ -9747,7 +10621,7 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Sets mipmap level bias for a texture reference. + /** @brief Sets mipmap level bias for a texture reference [Deprecated] @param [in] texRef Pointer of texture reference. @param [in] bias Value of mipmap bias. @@ -9763,7 +10637,7 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Sets mipmap level clamp for a texture reference.
+ /** @brief Sets mipmap level clamp for a texture reference [Deprecated] @param [in] texRef Pointer of texture reference. @param [in] minMipMapLevelClamp Value of minimum mipmap level clamp. @@ -9781,7 +10655,7 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Binds mipmapped array to a texture reference. + /** @brief Binds mipmapped array to a texture reference [Deprecated] @param [in] texRef Pointer of texture reference to bind. @param [in] mipmappedArray Pointer of mipmapped array to bind. @@ -9842,8 +10716,8 @@ extern "C" { @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues. */ pub fn hipStreamBeginCapture( stream: hipStream_t, @@ -9865,7 +10739,7 @@ are not safe. @returns #hipSuccess, #hipErrorInvalidValue - @warning : param "const hipGraphEdgeData* dependencyData" is currently not supported and has to + @warning param "const hipGraphEdgeData* dependencyData" is currently not supported and has to be passed as nullptr. This API is marked as beta, meaning, while this is feature complete, it is still open to changes and may have outstanding issues.*/ pub fn hipStreamBeginCaptureToGraph( @@ -9882,12 +10756,12 @@ extern "C" { /** @brief Ends capture on a stream, returning the captured graph. @param [in] stream - Stream to end capture. - @param [out] pGraph - returns the graph captured. + @param [out] pGraph - Captured graph. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues. */ pub fn hipStreamEndCapture( stream: hipStream_t, @@ -9898,14 +10772,14 @@ extern "C" { #[must_use] /** @brief Get capture status of a stream. - @param [in] stream - Stream under capture. - @param [out] pCaptureStatus - returns current status of the capture. - @param [out] pId - unique ID of the capture. + @param [in] stream - Stream of which to get capture status from. + @param [out] pCaptureStatus - Returns current capture status. + @param [out] pId - Unique capture ID. @returns #hipSuccess, #hipErrorStreamCaptureImplicit - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues. */ pub fn hipStreamGetCaptureInfo( stream: hipStream_t, @@ -9917,17 +10791,17 @@ extern "C" { #[must_use] /** @brief Get stream's capture state - @param [in] stream - Stream under capture. - @param [out] captureStatus_out - returns current status of the capture. - @param [out] id_out - unique ID of the capture. - @param [in] graph_out - returns the graph being captured into. - @param [out] dependencies_out - returns pointer to an array of nodes. - @param [out] numDependencies_out - returns size of the array returned in dependencies_out. + @param [in] stream - Stream of which to get capture status from. + @param [out] captureStatus_out - Returns current capture status. + @param [out] id_out - Unique capture ID. + @param [out] graph_out - Returns the graph being captured into. 
+ @param [out] dependencies_out - Pointer to an array of nodes representing the graphs dependencies. + @param [out] numDependencies_out - Returns size of the array returned in dependencies_out. @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorStreamCaptureImplicit - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues. */ pub fn hipStreamGetCaptureInfo_v2( stream: hipStream_t, @@ -9942,13 +10816,13 @@ extern "C" { #[must_use] /** @brief Get stream's capture state - @param [in] stream - Stream under capture. - @param [out] pCaptureStatus - returns current status of the capture. + @param [in] stream - Stream of which to get capture status from. + @param [out] pCaptureStatus - Returns current capture status. @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorStreamCaptureImplicit - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues. */ pub fn hipStreamIsCapturing( stream: hipStream_t, @@ -9959,15 +10833,15 @@ extern "C" { #[must_use] /** @brief Update the set of dependencies in a capturing stream - @param [in] stream Stream under capture. - @param [in] dependencies pointer to an array of nodes to Add/Replace. - @param [in] numDependencies size of the array in dependencies. - @param [in] flags Flag how to update dependency set. Should be one of value in enum - #hipStreamUpdateCaptureDependenciesFlags + @param [in] stream Stream that is being captured. + @param [in] dependencies Pointer to an array of nodes to add/replace. + @param [in] numDependencies Size of the dependencies array. + @param [in] flags Flag to update dependency set. Should be one of the values + in enum #hipStreamUpdateCaptureDependenciesFlags. @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorIllegalState - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues. */ pub fn hipStreamUpdateCaptureDependencies( stream: hipStream_t, @@ -9980,11 +10854,11 @@ extern "C" { #[must_use] /** @brief Swaps the stream capture mode of a thread. - @param [in] mode - Pointer to mode value to swap with the current mode + @param [in] mode - Pointer to mode value to swap with the current mode. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues. */ pub fn hipThreadExchangeStreamCaptureMode( mode: *mut hipStreamCaptureMode, @@ -9999,8 +10873,8 @@ extern "C" { @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorMemoryAllocation - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues. 
*/ pub fn hipGraphCreate( pGraph: *mut hipGraph_t, @@ -10015,8 +10889,8 @@ extern "C" { @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues. */ pub fn hipGraphDestroy(graph: hipGraph_t) -> hipError_t; } @@ -10024,14 +10898,14 @@ extern "C" { #[must_use] /** @brief Adds dependency edges to a graph. - @param [in] graph - instance of the graph to add dependencies. - @param [in] from - pointer to the graph nodes with dependenties to add from. - @param [in] to - pointer to the graph nodes to add dependenties to. - @param [in] numDependencies - the number of dependencies to add. + @param [in] graph - Instance of the graph to add dependencies to. + @param [in] from - Pointer to the graph nodes with dependencies to add from. + @param [in] to - Pointer to the graph nodes to add dependencies to. + @param [in] numDependencies - Number of dependencies to add. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues. */ pub fn hipGraphAddDependencies( graph: hipGraph_t, @@ -10044,14 +10918,14 @@ extern "C" { #[must_use] /** @brief Removes dependency edges from a graph. - @param [in] graph - instance of the graph to remove dependencies. + @param [in] graph - Instance of the graph to remove dependencies from. @param [in] from - Array of nodes that provide the dependencies. @param [in] to - Array of dependent nodes. - @param [in] numDependencies - the number of dependencies to remove. + @param [in] numDependencies - Number of dependencies to remove. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues. */ pub fn hipGraphRemoveDependencies( graph: hipGraph_t, @@ -10064,18 +10938,18 @@ extern "C" { #[must_use] /** @brief Returns a graph's dependency edges. - @param [in] graph - instance of the graph to get the edges from. - @param [out] from - pointer to the graph nodes to return edge endpoints. - @param [out] to - pointer to the graph nodes to return edge endpoints. - @param [out] numEdges - returns number of edges. + @param [in] graph - Instance of the graph to get the edges from. + @param [out] from - Pointer to the graph nodes to return edge endpoints. + @param [out] to - Pointer to the graph nodes to return edge endpoints. + @param [out] numEdges - Returns number of edges. @returns #hipSuccess, #hipErrorInvalidValue from and to may both be NULL, in which case this function only returns the number of edges in numEdges. Otherwise, numEdges entries will be filled in. If numEdges is higher than the actual number of edges, the remaining entries in from and to will be set to NULL, and the number of - edges actually returned will be written to numEdges - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + edges actually returned will be written to numEdges. 
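// The NULL-query pattern documented above, spelled out for hipGraphGetEdges: the first
// call obtains the edge count, the second fills both endpoint arrays, and the count is
// written back in case fewer edges were returned. Node handles are raw pointers in
// these bindings; `check` as in the earlier sketch.
unsafe fn graph_edges(
    graph: hipGraph_t,
) -> Result<(Vec<hipGraphNode_t>, Vec<hipGraphNode_t>), hipError_t> {
    let mut n = 0usize;
    check(hipGraphGetEdges(graph, std::ptr::null_mut(), std::ptr::null_mut(), &mut n))?;
    let mut from: Vec<hipGraphNode_t> = vec![std::ptr::null_mut(); n];
    let mut to: Vec<hipGraphNode_t> = vec![std::ptr::null_mut(); n];
    check(hipGraphGetEdges(graph, from.as_mut_ptr(), to.as_mut_ptr(), &mut n))?;
    from.truncate(n);
    to.truncate(n);
    Ok((from, to))
}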
+ @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues. */ pub fn hipGraphGetEdges( graph: hipGraph_t, @@ -10086,19 +10960,19 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Returns graph nodes. + /** @brief Returns a graph's nodes. - @param [in] graph - instance of graph to get the nodes. - @param [out] nodes - pointer to return the graph nodes. - @param [out] numNodes - returns number of graph nodes. + @param [in] graph - Instance of graph to get the nodes from. + @param [out] nodes - Pointer to return the graph nodes. + @param [out] numNodes - Returns the number of graph nodes. @returns #hipSuccess, #hipErrorInvalidValue nodes may be NULL, in which case this function will return the number of nodes in numNodes. Otherwise, numNodes entries will be filled in. If numNodes is higher than the actual number of nodes, the remaining entries in nodes will be set to NULL, and the number of nodes actually obtained will be returned in numNodes. - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues. */ pub fn hipGraphGetNodes( graph: hipGraph_t, @@ -10108,19 +10982,19 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Returns graph's root nodes. + /** @brief Returns a graph's root nodes. - @param [in] graph - instance of the graph to get the nodes. - @param [out] pRootNodes - pointer to return the graph's root nodes. - @param [out] pNumRootNodes - returns the number of graph's root nodes. + @param [in] graph - Instance of the graph to get the nodes from. + @param [out] pRootNodes - Pointer to return the graph's root nodes. + @param [out] pNumRootNodes - Returns the number of graph's root nodes. @returns #hipSuccess, #hipErrorInvalidValue pRootNodes may be NULL, in which case this function will return the number of root nodes in pNumRootNodes. Otherwise, pNumRootNodes entries will be filled in. If pNumRootNodes is higher than the actual number of root nodes, the remaining entries in pRootNodes will be set to NULL, and the number of nodes actually obtained will be returned in pNumRootNodes. - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues. */ pub fn hipGraphGetRootNodes( graph: hipGraph_t, @@ -10132,17 +11006,17 @@ extern "C" { #[must_use] /** @brief Returns a node's dependencies. - @param [in] node - graph node to get the dependencies from. - @param [out] pDependencies - pointer to to return the dependencies. - @param [out] pNumDependencies - returns the number of graph node dependencies. + @param [in] node - Graph node to get the dependencies from. + @param [out] pDependencies - Pointer to return the dependencies. + @param [out] pNumDependencies - Returns the number of graph node dependencies. @returns #hipSuccess, #hipErrorInvalidValue pDependencies may be NULL, in which case this function will return the number of dependencies in pNumDependencies. Otherwise, pNumDependencies entries will be filled in. 
If pNumDependencies is higher than the actual number of dependencies, the remaining entries in pDependencies will be set to NULL, and the number of nodes actually obtained will be returned in pNumDependencies. - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues. */ pub fn hipGraphNodeGetDependencies( node: hipGraphNode_t, @@ -10154,18 +11028,18 @@ extern "C" { #[must_use] /** @brief Returns a node's dependent nodes. - @param [in] node - graph node to get the Dependent nodes from. - @param [out] pDependentNodes - pointer to return the graph dependent nodes. - @param [out] pNumDependentNodes - returns the number of graph node dependent nodes. + @param [in] node - Graph node to get the dependent nodes from. + @param [out] pDependentNodes - Pointer to return the graph dependent nodes. + @param [out] pNumDependentNodes - Returns the number of graph node dependent nodes. @returns #hipSuccess, #hipErrorInvalidValue - DependentNodes may be NULL, in which case this function will return the number of dependent nodes + pDependentNodes may be NULL, in which case this function will return the number of dependent nodes in pNumDependentNodes. Otherwise, pNumDependentNodes entries will be filled in. If pNumDependentNodes is higher than the actual number of dependent nodes, the remaining entries in pDependentNodes will be set to NULL, and the number of nodes actually obtained will be returned in pNumDependentNodes. - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues. */ pub fn hipGraphNodeGetDependentNodes( node: hipGraphNode_t, @@ -10177,12 +11051,12 @@ extern "C" { #[must_use] /** @brief Returns a node's type. - @param [in] node - instance of the graph to add dependencies. - @param [out] pType - pointer to the return the type + @param [in] node - Node to get type of. + @param [out] pType - Returns the node's type. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues. */ pub fn hipGraphNodeGetType( node: hipGraphNode_t, @@ -10196,8 +11070,8 @@ extern "C" { @param [in] node - graph node to remove @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues. */ pub fn hipGraphDestroyNode(node: hipGraphNode_t) -> hipError_t; } @@ -10209,8 +11083,8 @@ extern "C" { @param [in] originalGraph - original graph to clone from. @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorMemoryAllocation - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues. 
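+
+ An illustrative sketch (not from the upstream HIP documentation; error
+ handling omitted):
+ @code
+ let mut original: hipGraph_t = std::ptr::null_mut();
+ let mut clone: hipGraph_t = std::ptr::null_mut();
+ unsafe {
+     hipGraphCreate(&mut original, 0);
+     hipGraphClone(&mut clone, original);
+     hipGraphDestroy(clone);
+     hipGraphDestroy(original);
+ }
+ @endcode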
*/ pub fn hipGraphClone( pGraphClone: *mut hipGraph_t, @@ -10226,8 +11100,8 @@ extern "C" { @param [in] clonedGraph - Cloned graph to query. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues. */ pub fn hipGraphNodeFindInClone( pNode: *mut hipGraphNode_t, @@ -10239,17 +11113,17 @@ extern "C" { #[must_use] /** @brief Creates an executable graph from a graph - @param [out] pGraphExec - pointer to instantiated executable graph that is created. - @param [in] graph - instance of graph to instantiate. - @param [out] pErrorNode - pointer to error node in case error occured in graph instantiation, - it could modify the correponding node. - @param [out] pLogBuffer - pointer to log buffer. - @param [out] bufferSize - the size of log buffer. + @param [out] pGraphExec - Pointer to instantiated executable graph. + @param [in] graph - Instance of graph to instantiate. + @param [out] pErrorNode - Pointer to error node. In case an error occurred during + graph instantiation, it could modify the corresponding node. + @param [out] pLogBuffer - Pointer to log buffer. + @param [out] bufferSize - Size of the log buffer. @returns #hipSuccess, #hipErrorOutOfMemory - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues. */ pub fn hipGraphInstantiate( pGraphExec: *mut hipGraphExec_t, @@ -10263,14 +11137,14 @@ extern "C" { #[must_use] /** @brief Creates an executable graph from a graph. - @param [out] pGraphExec - pointer to instantiated executable graph that is created. - @param [in] graph - instance of graph to instantiate. + @param [out] pGraphExec - Pointer to instantiated executable graph. + @param [in] graph - Instance of graph to instantiate. @param [in] flags - Flags to control instantiation. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.It does not support - any of flag and is behaving as hipGraphInstantiate.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues. It does not support any + flags and behaves as hipGraphInstantiate.*/ pub fn hipGraphInstantiateWithFlags( pGraphExec: *mut hipGraphExec_t, graph: hipGraph_t, @@ -10281,13 +11155,13 @@ extern "C" { #[must_use] /** @brief Creates an executable graph from a graph. - @param [out] pGraphExec - pointer to instantiated executable graph that is created. - @param [in] graph - instance of graph to instantiate. - @param [in] instantiateParams - Graph Instantiate Params + @param [out] pGraphExec - Pointer to instantiated executable graph. + @param [in] graph - Instance of graph to instantiate. + @param [in] instantiateParams - Graph instantiation parameters. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta.
While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphInstantiateWithParams( pGraphExec: *mut hipGraphExec_t, graph: hipGraph_t, @@ -10296,40 +11170,40 @@ extern "C" { #[must_use] - /** @brief launches an executable graph in a stream + /** @brief Launches an executable graph in the specified stream. - @param [in] graphExec - instance of executable graph to launch. - @param [in] stream - instance of stream in which to launch executable graph. + @param [in] graphExec - Instance of executable graph to launch. + @param [in] stream - Instance of stream in which to launch executable graph. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphLaunch(graphExec: hipGraphExec_t, stream: hipStream_t) -> hipError_t; } extern "C" { #[must_use] - /** @brief uploads an executable graph in a stream + /** @brief Uploads an executable graph to a stream - @param [in] graphExec - instance of executable graph to launch. - @param [in] stream - instance of stream in which to launch executable graph. + @param [in] graphExec - Instance of executable graph to be uploaded. + @param [in] stream - Instance of stream to which the executable graph is uploaded. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphUpload(graphExec: hipGraphExec_t, stream: hipStream_t) -> hipError_t; } extern "C" { #[must_use] /** @brief Creates a kernel execution node and adds it to a graph. - @param [out] pGraphNode - pointer to graph node to create. - @param [in] graph - instance of graph to add the created node. - @param [in] pDependencies - pointer to the dependencies on the kernel execution node. - @param [in] numDependencies - the number of the dependencies. - @param [in] nodeParams - pointer to the parameters for the node. + @param [out] pGraphNode - Pointer to kernel graph node that is created. + @param [in] graph - Instance of graph to add the created node to. + @param [in] pDependencies - Pointer to the dependencies on the kernel execution node. + @param [in] numDependencies - Number of dependencies. + @param [in] nodeParams - Pointer to the node parameters. @returns #hipSuccess, #hipErrorInvalidValue. - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphAddNode( pGraphNode: *mut hipGraphNode_t, graph: hipGraph_t, @@ -10338,16 +11212,60 @@ extern "C" { nodeParams: *mut hipGraphNodeParams, ) -> hipError_t; } +extern "C" { + #[must_use] + /** @brief Return the flags of an executable graph. + + @param [in] graphExec - Executable graph to get the flags from. + @param [out] flags - Flags used to instantiate this executable graph. + @returns #hipSuccess, #hipErrorInvalidValue. + @warning This API is marked as Beta.
While this feature is complete, it can + change and might have outstanding issues.*/ + pub fn hipGraphExecGetFlags( + graphExec: hipGraphExec_t, + flags: *mut ::core::ffi::c_ulonglong, + ) -> hipError_t; +} +extern "C" { + #[must_use] + /** @brief Updates parameters of a graph's node. + + @param [in] node - Instance of the node to set parameters for. + @param [in] nodeParams - Pointer to the parameters to be set. + @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidDeviceFunction, #hipErrorNotSupported. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ + pub fn hipGraphNodeSetParams( + node: hipGraphNode_t, + nodeParams: *mut hipGraphNodeParams, + ) -> hipError_t; +} +extern "C" { + #[must_use] + /** @brief Updates parameters of an executable graph's node. + + @param [in] graphExec - Instance of the executable graph. + @param [in] node - Instance of the node to set parameters to. + @param [in] nodeParams - Pointer to the parameters to be set. + @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidDeviceFunction, #hipErrorNotSupported. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ + pub fn hipGraphExecNodeSetParams( + graphExec: hipGraphExec_t, + node: hipGraphNode_t, + nodeParams: *mut hipGraphNodeParams, + ) -> hipError_t; +} extern "C" { #[must_use] /** @brief Destroys an executable graph - @param [in] graphExec - instance of executable graph to destry. + @param [in] graphExec - Instance of executable graph to destroy. @returns #hipSuccess. - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphExecDestroy(graphExec: hipGraphExec_t) -> hipError_t; } extern "C" { @@ -10358,11 +11276,11 @@ extern "C" { @param [in] hGraphExec - instance of executable graph to update. @param [in] hGraph - graph that contains the updated parameters. @param [in] hErrorNode_out - node which caused the permissibility check to forbid the update. - @param [in] updateResult_out - Whether the graph update was permitted. + @param [in] updateResult_out - Return code indicating whether the graph update was performed. @returns #hipSuccess, #hipErrorGraphExecUpdateFailure - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphExecUpdate( hGraphExec: hipGraphExec_t, hGraph: hipGraph_t, @@ -10374,14 +11292,14 @@ extern "C" { #[must_use] /** @brief Creates a kernel execution node and adds it to a graph. - @param [out] pGraphNode - pointer to graph node to create. - @param [in] graph - instance of graph to add the created node. - @param [in] pDependencies - pointer to the dependencies on the kernel execution node. - @param [in] numDependencies - the number of the dependencies. - @param [in] pNodeParams - pointer to the parameters to the kernel execution node on the GPU. + @param [out] pGraphNode - Pointer to graph node that is created. + @param [in] graph - Instance of graph to add the created node to. + @param [in] pDependencies - Pointer to the dependencies of the kernel execution node.
+ @param [in] numDependencies - The number of the dependencies. + @param [in] pNodeParams - Pointer to the parameters of the kernel execution node. @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidDeviceFunction - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphAddKernelNode( pGraphNode: *mut hipGraphNode_t, graph: hipGraph_t, @@ -10397,8 +11315,8 @@ extern "C" { @param [in] node - instance of the node to get parameters from. @param [out] pNodeParams - pointer to the parameters @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphKernelNodeGetParams( node: hipGraphNode_t, pNodeParams: *mut hipKernelNodeParams, @@ -10408,11 +11326,11 @@ extern "C" { #[must_use] /** @brief Sets a kernel node's parameters. - @param [in] node - instance of the node to set parameters to. + @param [in] node - Instance of the node to set parameters of. @param [in] pNodeParams - const pointer to the parameters. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphKernelNodeSetParams( node: hipGraphNode_t, pNodeParams: *const hipKernelNodeParams, @@ -10422,12 +11340,12 @@ extern "C" { #[must_use] /** @brief Sets the parameters for a kernel node in the given graphExec. - @param [in] hGraphExec - instance of the executable graph with the node. - @param [in] node - instance of the node to set parameters to. + @param [in] hGraphExec - Instance of the executable graph with the node. + @param [in] node - Instance of the node to set parameters of. @param [in] pNodeParams - const pointer to the kernel node parameters. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphExecKernelNodeSetParams( hGraphExec: hipGraphExec_t, node: hipGraphNode_t, @@ -10438,15 +11356,15 @@ extern "C" { #[must_use] /** @brief Creates a memcpy node and adds it to a graph. - @param [out] phGraphNode - pointer to graph node to create. - @param [in] hGraph - instance of graph to add the created node. - @param [in] dependencies - const pointer to the dependencies on the memcpy execution node. - @param [in] numDependencies - the number of the dependencies. + @param [out] phGraphNode - Pointer to graph node that is created. + @param [in] hGraph - Instance of graph to add the created node to. + @param [in] dependencies - const pointer to the dependencies of the memcpy execution node. + @param [in] numDependencies - The number of dependencies. @param [in] copyParams - const pointer to the parameters for the memory copy. - @param [in] ctx - cotext related to current device. 
+ @param [in] ctx - context related to current device. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipDrvGraphAddMemcpyNode( phGraphNode: *mut hipGraphNode_t, hGraph: hipGraph_t, @@ -10460,14 +11378,14 @@ extern "C" { #[must_use] /** @brief Creates a memcpy node and adds it to a graph. - @param [out] pGraphNode - pointer to graph node to create. - @param [in] graph - instance of graph to add the created node. - @param [in] pDependencies - const pointer to the dependencies on the memcpy execution node. - @param [in] numDependencies - the number of the dependencies. + @param [out] pGraphNode - Pointer to graph node that is created. + @param [in] graph - Instance of graph to add the created node to. + @param [in] pDependencies - const pointer to the dependencies of the memcpy execution node. + @param [in] numDependencies - The number of dependencies. @param [in] pCopyParams - const pointer to the parameters for the memory copy. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphAddMemcpyNode( pGraphNode: *mut hipGraphNode_t, graph: hipGraph_t, @@ -10483,8 +11401,8 @@ extern "C" { @param [in] node - instance of the node to get parameters from. @param [out] pNodeParams - pointer to the parameters. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphMemcpyNodeGetParams( node: hipGraphNode_t, pNodeParams: *mut hipMemcpy3DParms, @@ -10497,8 +11415,8 @@ extern "C" { @param [in] node - instance of the node to set parameters to. @param [in] pNodeParams - const pointer to the parameters. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphMemcpyNodeSetParams( node: hipGraphNode_t, pNodeParams: *const hipMemcpy3DParms, @@ -10506,14 +11424,14 @@ extern "C" { } extern "C" { #[must_use] - /** @brief Sets a node attribute. + /** @brief Sets a node's attribute. - @param [in] hNode - instance of the node to set parameters to. - @param [in] attr - the attribute node is set to. + @param [in] hNode - Instance of the node to set parameters of. + @param [in] attr - The attribute type to be set. @param [in] value - const pointer to the parameters. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. 
While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphKernelNodeSetAttribute( hNode: hipGraphNode_t, attr: hipLaunchAttributeID, @@ -10522,14 +11440,14 @@ extern "C" { #[must_use] - /** @brief Gets a node attribute. + /** @brief Gets a node's attribute. - @param [in] hNode - instance of the node to set parameters to. - @param [in] attr - the attribute node is set to. + @param [in] hNode - Instance of the node to get the attribute of. + @param [in] attr - The attribute type to be queried. @param [in] value - const pointer to the parameters. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphKernelNodeGetAttribute( hNode: hipGraphNode_t, attr: hipLaunchAttributeID, @@ -10538,14 +11456,14 @@ extern "C" { #[must_use] - /** @brief Sets the parameters for a memcpy node in the given graphExec. + /** @brief Sets the parameters of a memcpy node in the given graphExec. - @param [in] hGraphExec - instance of the executable graph with the node. - @param [in] node - instance of the node to set parameters to. + @param [in] hGraphExec - Instance of the executable graph with the node. + @param [in] node - Instance of the node to set parameters of. @param [in] pNodeParams - const pointer to the kernel node parameters. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphExecMemcpyNodeSetParams( hGraphExec: hipGraphExec_t, node: hipGraphNode_t, @@ -10556,17 +11474,17 @@ extern "C" { #[must_use] /** @brief Creates a 1D memcpy node and adds it to a graph. - @param [out] pGraphNode - pointer to graph node to create. - @param [in] graph - instance of graph to add the created node. - @param [in] pDependencies - const pointer to the dependencies on the memcpy execution node. - @param [in] numDependencies - the number of the dependencies. - @param [in] dst - pointer to memory address to the destination. - @param [in] src - pointer to memory address to the source. - @param [in] count - the size of the memory to copy. - @param [in] kind - the type of memory copy. + @param [out] pGraphNode - Pointer to graph node that is created. + @param [in] graph - Instance of graph to add the created node to. + @param [in] pDependencies - const pointer to the dependencies of the memcpy execution node. + @param [in] numDependencies - The number of dependencies. + @param [in] dst - Pointer to memory address of the destination. + @param [in] src - Pointer to memory address of the source. + @param [in] count - Size of the memory to copy. + @param [in] kind - Type of memory copy. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta.
While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphAddMemcpyNode1D( pGraphNode: *mut hipGraphNode_t, graph: hipGraph_t, @@ -10582,14 +11500,14 @@ extern "C" { #[must_use] /** @brief Sets a memcpy node's parameters to perform a 1-dimensional copy. - @param [in] node - instance of the node to set parameters to. - @param [in] dst - pointer to memory address to the destination. - @param [in] src - pointer to memory address to the source. - @param [in] count - the size of the memory to copy. - @param [in] kind - the type of memory copy. + @param [in] node - Instance of the node to set parameters of. + @param [in] dst - Pointer to memory address of the destination. + @param [in] src - Pointer to memory address of the source. + @param [in] count - Size of the memory to copy. + @param [in] kind - Type of memory copy. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphMemcpyNodeSetParams1D( node: hipGraphNode_t, dst: *mut ::core::ffi::c_void, @@ -10603,15 +11521,15 @@ extern "C" { /** @brief Sets the parameters for a memcpy node in the given graphExec to perform a 1-dimensional copy. - @param [in] hGraphExec - instance of the executable graph with the node. - @param [in] node - instance of the node to set parameters to. - @param [in] dst - pointer to memory address to the destination. - @param [in] src - pointer to memory address to the source. - @param [in] count - the size of the memory to copy. - @param [in] kind - the type of memory copy. + @param [in] hGraphExec - Instance of the executable graph with the node. + @param [in] node - Instance of the node to set parameters of. + @param [in] dst - Pointer to memory address of the destination. + @param [in] src - Pointer to memory address of the source. + @param [in] count - Size of the memory to copy. + @param [in] kind - Type of memory copy. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphExecMemcpyNodeSetParams1D( hGraphExec: hipGraphExec_t, node: hipGraphNode_t, @@ -10625,18 +11543,18 @@ extern "C" { #[must_use] /** @brief Creates a memcpy node to copy from a symbol on the device and adds it to a graph. - @param [out] pGraphNode - pointer to graph node to create. - @param [in] graph - instance of graph to add the created node. - @param [in] pDependencies - const pointer to the dependencies on the memcpy execution node. - @param [in] numDependencies - the number of the dependencies. - @param [in] dst - pointer to memory address to the destination. + @param [out] pGraphNode - Pointer to graph node that is created. + @param [in] graph - Instance of graph to add the created node to. + @param [in] pDependencies - const pointer to the dependencies of the memcpy execution node. + @param [in] numDependencies - Number of the dependencies. + @param [in] dst - Pointer to memory address of the destination. @param [in] symbol - Device symbol address. - @param [in] count - the size of the memory to copy. + @param [in] count - Size of the memory to copy. 
@param [in] offset - Offset from start of symbol in bytes. - @param [in] kind - the type of memory copy. + @param [in] kind - Type of memory copy. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphAddMemcpyNodeFromSymbol( pGraphNode: *mut hipGraphNode_t, graph: hipGraph_t, @@ -10653,15 +11571,15 @@ extern "C" { #[must_use] /** @brief Sets a memcpy node's parameters to copy from a symbol on the device. - @param [in] node - instance of the node to set parameters to. - @param [in] dst - pointer to memory address to the destination. + @param [in] node - Instance of the node to set parameters of. + @param [in] dst - Pointer to memory address of the destination. @param [in] symbol - Device symbol address. - @param [in] count - the size of the memory to copy. + @param [in] count - Size of the memory to copy. @param [in] offset - Offset from start of symbol in bytes. - @param [in] kind - the type of memory copy. + @param [in] kind - Type of memory copy. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphMemcpyNodeSetParamsFromSymbol( node: hipGraphNode_t, dst: *mut ::core::ffi::c_void, @@ -10676,16 +11594,16 @@ extern "C" { /** @brief Sets the parameters for a memcpy node in the given graphExec to copy from a symbol on the * device. - @param [in] hGraphExec - instance of the executable graph with the node. - @param [in] node - instance of the node to set parameters to. - @param [in] dst - pointer to memory address to the destination. + @param [in] hGraphExec - Instance of the executable graph with the node. + @param [in] node - Instance of the node to set parameters of. + @param [in] dst - Pointer to memory address of the destination. @param [in] symbol - Device symbol address. - @param [in] count - the size of the memory to copy. + @param [in] count - Size of the memory to copy. @param [in] offset - Offset from start of symbol in bytes. - @param [in] kind - the type of memory copy. + @param [in] kind - Type of memory copy. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphExecMemcpyNodeSetParamsFromSymbol( hGraphExec: hipGraphExec_t, node: hipGraphNode_t, @@ -10700,18 +11618,18 @@ extern "C" { #[must_use] /** @brief Creates a memcpy node to copy to a symbol on the device and adds it to a graph. - @param [out] pGraphNode - pointer to graph node to create. - @param [in] graph - instance of graph to add the created node. + @param [out] pGraphNode - Pointer to graph node that is created. + @param [in] graph - Instance of graph to add the created node to. @param [in] pDependencies - const pointer to the dependencies on the memcpy execution node. - @param [in] numDependencies - the number of the dependencies. + @param [in] numDependencies - Number of dependencies. 
@param [in] symbol - Device symbol address. - @param [in] src - pointer to memory address of the src. - @param [in] count - the size of the memory to copy. + @param [in] src - Pointer to memory address of the src. + @param [in] count - Size of the memory to copy. @param [in] offset - Offset from start of symbol in bytes. - @param [in] kind - the type of memory copy. + @param [in] kind - Type of memory copy. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphAddMemcpyNodeToSymbol( pGraphNode: *mut hipGraphNode_t, graph: hipGraph_t, @@ -10728,15 +11646,15 @@ extern "C" { #[must_use] /** @brief Sets a memcpy node's parameters to copy to a symbol on the device. - @param [in] node - instance of the node to set parameters to. + @param [in] node - Instance of the node to set parameters of. @param [in] symbol - Device symbol address. - @param [in] src - pointer to memory address of the src. - @param [in] count - the size of the memory to copy. + @param [in] src - Pointer to memory address of the src. + @param [in] count - Size of the memory to copy. @param [in] offset - Offset from start of symbol in bytes. - @param [in] kind - the type of memory copy. + @param [in] kind - Type of memory copy. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphMemcpyNodeSetParamsToSymbol( node: hipGraphNode_t, symbol: *const ::core::ffi::c_void, @@ -10750,16 +11668,16 @@ extern "C" { #[must_use] /** @brief Sets the parameters for a memcpy node in the given graphExec to copy to a symbol on the device. - @param [in] hGraphExec - instance of the executable graph with the node. - @param [in] node - instance of the node to set parameters to. + @param [in] hGraphExec - Instance of the executable graph with the node. + @param [in] node - Instance of the node to set parameters of. @param [in] symbol - Device symbol address. - @param [in] src - pointer to memory address of the src. - @param [in] count - the size of the memory to copy. + @param [in] src - Pointer to memory address of the src. + @param [in] count - Size of the memory to copy. @param [in] offset - Offset from start of symbol in bytes. - @param [in] kind - the type of memory copy. + @param [in] kind - Type of memory copy. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphExecMemcpyNodeSetParamsToSymbol( hGraphExec: hipGraphExec_t, node: hipGraphNode_t, @@ -10774,14 +11692,14 @@ extern "C" { #[must_use] /** @brief Creates a memset node and adds it to a graph. - @param [out] pGraphNode - pointer to the graph node to create. - @param [in] graph - instance of the graph to add the created node. + @param [out] pGraphNode - Pointer to graph node that is created. + @param [in] graph - Instance of the graph to add the created node to. 
@param [in] pDependencies - const pointer to the dependencies on the memset execution node. - @param [in] numDependencies - the number of the dependencies. + @param [in] numDependencies - Number of dependencies. @param [in] pMemsetParams - const pointer to the parameters for the memory set. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphAddMemsetNode( pGraphNode: *mut hipGraphNode_t, graph: hipGraph_t, @@ -10794,11 +11712,11 @@ extern "C" { #[must_use] /** @brief Gets a memset node's parameters. - @param [in] node - instane of the node to get parameters from. - @param [out] pNodeParams - pointer to the parameters. + @param [in] node - Instance of the node to get parameters of. + @param [out] pNodeParams - Pointer to the parameters. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphMemsetNodeGetParams( node: hipGraphNode_t, pNodeParams: *mut hipMemsetParams, @@ -10808,11 +11726,11 @@ extern "C" { #[must_use] /** @brief Sets a memset node's parameters. - @param [in] node - instance of the node to set parameters to. - @param [in] pNodeParams - pointer to the parameters. + @param [in] node - Instance of the node to set parameters of. + @param [in] pNodeParams - Pointer to the parameters. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphMemsetNodeSetParams( node: hipGraphNode_t, pNodeParams: *const hipMemsetParams, @@ -10822,12 +11740,12 @@ extern "C" { #[must_use] /** @brief Sets the parameters for a memset node in the given graphExec. - @param [in] hGraphExec - instance of the executable graph with the node. - @param [in] node - instance of the node to set parameters to. - @param [in] pNodeParams - pointer to the parameters. + @param [in] hGraphExec - Instance of the executable graph with the node. + @param [in] node - Instance of the node to set parameters of. + @param [in] pNodeParams - Pointer to the parameters. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphExecMemsetNodeSetParams( hGraphExec: hipGraphExec_t, node: hipGraphNode_t, @@ -10838,14 +11756,14 @@ extern "C" { #[must_use] /** @brief Creates a host execution node and adds it to a graph. - @param [out] pGraphNode - pointer to the graph node to create. - @param [in] graph - instance of the graph to add the created node. - @param [in] pDependencies - const pointer to the dependencies on the memset execution node. - @param [in] numDependencies - the number of the dependencies. - @param [in] pNodeParams -pointer to the parameters. 
+ @param [out] pGraphNode - Pointer to graph node that is created. + @param [in] graph - Instance of the graph to add the created node to. + @param [in] pDependencies - const pointer to the dependencies of the host execution node. + @param [in] numDependencies - Number of dependencies. + @param [in] pNodeParams - Pointer to the parameters. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphAddHostNode( pGraphNode: *mut hipGraphNode_t, graph: hipGraph_t, @@ -10858,11 +11776,11 @@ extern "C" { #[must_use] /** @brief Returns a host node's parameters. - @param [in] node - instane of the node to get parameters from. - @param [out] pNodeParams - pointer to the parameters. + @param [in] node - Instance of the node to get parameters of. + @param [out] pNodeParams - Pointer to the parameters. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphHostNodeGetParams( node: hipGraphNode_t, pNodeParams: *mut hipHostNodeParams, @@ -10872,11 +11790,11 @@ extern "C" { #[must_use] /** @brief Sets a host node's parameters. - @param [in] node - instance of the node to set parameters to. - @param [in] pNodeParams - pointer to the parameters. + @param [in] node - Instance of the node to set parameters of. + @param [in] pNodeParams - Pointer to the parameters. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphHostNodeSetParams( node: hipGraphNode_t, pNodeParams: *const hipHostNodeParams, @@ -10886,12 +11804,12 @@ extern "C" { #[must_use] /** @brief Sets the parameters for a host node in the given graphExec. - @param [in] hGraphExec - instance of the executable graph with the node. - @param [in] node - instance of the node to set parameters to. - @param [in] pNodeParams - pointer to the parameters. + @param [in] hGraphExec - Instance of the executable graph with the node. + @param [in] node - Instance of the node to set parameters of. + @param [in] pNodeParams - Pointer to the parameters. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphExecHostNodeSetParams( hGraphExec: hipGraphExec_t, node: hipGraphNode_t, @@ -10902,14 +11820,14 @@ extern "C" { #[must_use] /** @brief Creates a child graph node and adds it to a graph. - @param [out] pGraphNode - pointer to the graph node to create. - @param [in] graph - instance of the graph to add the created node. - @param [in] pDependencies - const pointer to the dependencies on the memset execution node. - @param [in] numDependencies - the number of the dependencies.
- @param [in] childGraph - the graph to clone into this node + @param [out] pGraphNode - Pointer to graph node that is created. + @param [in] graph - Instance of the graph to add the created node to. + @param [in] pDependencies - const pointer to the dependencies of the child graph node. + @param [in] numDependencies - Number of dependencies. + @param [in] childGraph - Graph to clone into this node @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphAddChildGraphNode( pGraphNode: *mut hipGraphNode_t, graph: hipGraph_t, @@ -10922,11 +11840,11 @@ extern "C" { #[must_use] /** @brief Gets a handle to the embedded graph of a child graph node. - @param [in] node - instane of the node to get child graph. - @param [out] pGraph - pointer to get the graph. + @param [in] node - Instance of the node to get child graph of. + @param [out] pGraph - Pointer to get the graph. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphChildGraphNodeGetGraph( node: hipGraphNode_t, pGraph: *mut hipGraph_t, @@ -10940,8 +11858,8 @@ extern "C" { @param [in] node - node from the graph which was used to instantiate graphExec. @param [in] childGraph - child graph with updated parameters. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphExecChildGraphNodeSetParams( hGraphExec: hipGraphExec_t, node: hipGraphNode_t, @@ -10952,13 +11870,13 @@ extern "C" { #[must_use] /** @brief Creates an empty node and adds it to a graph. - @param [out] pGraphNode - pointer to the graph node to create and add to the graph. - @param [in] graph - instane of the graph the node is add to. - @param [in] pDependencies - const pointer to the node dependenties. - @param [in] numDependencies - the number of dependencies. + @param [out] pGraphNode - Pointer to graph node that is created. + @param [in] graph - Instance of the graph the node is added to. + @param [in] pDependencies - const pointer to the node dependencies. + @param [in] numDependencies - Number of dependencies. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphAddEmptyNode( pGraphNode: *mut hipGraphNode_t, graph: hipGraph_t, @@ -10970,14 +11888,14 @@ extern "C" { #[must_use] /** @brief Creates an event record node and adds it to a graph. - @param [out] pGraphNode - pointer to the graph node to create and add to the graph. - @param [in] graph - instane of the graph the node to be added. - @param [in] pDependencies - const pointer to the node dependenties.
- @param [in] numDependencies - the number of dependencies. - @param [in] event - Event for the node. + @param [out] pGraphNode - Pointer to graph node that is created. + @param [in] graph - Instance of the graph the node is added to. + @param [in] pDependencies - const pointer to the node dependencies. + @param [in] numDependencies - Number of dependencies. + @param [in] event - Event of the node. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphAddEventRecordNode( pGraphNode: *mut hipGraphNode_t, graph: hipGraph_t, @@ -10990,11 +11908,11 @@ extern "C" { #[must_use] /** @brief Returns the event associated with an event record node. - @param [in] node - instane of the node to get event from. + @param [in] node - Instance of the node to get event of. @param [out] event_out - Pointer to return the event. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphEventRecordNodeGetEvent( node: hipGraphNode_t, event_out: *mut hipEvent_t, @@ -11004,11 +11922,11 @@ extern "C" { #[must_use] /** @brief Sets an event record node's event. - @param [in] node - instane of the node to set event to. - @param [in] event - pointer to the event. + @param [in] node - Instance of the node to set event to. + @param [in] event - Pointer to the event. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphEventRecordNodeSetEvent( node: hipGraphNode_t, event: hipEvent_t, @@ -11022,8 +11940,8 @@ extern "C" { @param [in] hNode - node from the graph which was used to instantiate graphExec. @param [in] event - pointer to the event. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphExecEventRecordNodeSetEvent( hGraphExec: hipGraphExec_t, hNode: hipGraphNode_t, @@ -11034,14 +11952,14 @@ extern "C" { #[must_use] /** @brief Creates an event wait node and adds it to a graph. - @param [out] pGraphNode - pointer to the graph node to create and add to the graph. - @param [in] graph - instane of the graph the node to be added. - @param [in] pDependencies - const pointer to the node dependenties. - @param [in] numDependencies - the number of dependencies. + @param [out] pGraphNode - Pointer to graph node that is created. + @param [in] graph - Instance of the graph the node is added to. + @param [in] pDependencies - const pointer to the node dependencies. + @param [in] numDependencies - Number of dependencies. @param [in] event - Event for the node.
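+
+ An illustrative sketch of adding a wait node with no extra dependencies
+ (not from the upstream HIP documentation; error handling omitted, and it
+ assumes the hipEventCreate/hipEventDestroy bindings declared elsewhere in
+ this file):
+ @code
+ let mut graph: hipGraph_t = std::ptr::null_mut();
+ let mut event: hipEvent_t = std::ptr::null_mut();
+ let mut wait_node: hipGraphNode_t = std::ptr::null_mut();
+ unsafe {
+     hipGraphCreate(&mut graph, 0);
+     hipEventCreate(&mut event);
+     // A null dependency array with numDependencies == 0 is valid.
+     hipGraphAddEventWaitNode(&mut wait_node, graph, std::ptr::null(), 0, event);
+     hipGraphDestroy(graph);
+     hipEventDestroy(event);
+ }
+ @endcode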
@returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphAddEventWaitNode( pGraphNode: *mut hipGraphNode_t, graph: hipGraph_t, @@ -11054,11 +11972,11 @@ extern "C" { #[must_use] /** @brief Returns the event associated with an event wait node. - @param [in] node - instane of the node to get event from. + @param [in] node - Instance of the node to get event of. @param [out] event_out - Pointer to return the event. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphEventWaitNodeGetEvent( node: hipGraphNode_t, event_out: *mut hipEvent_t, @@ -11068,11 +11986,11 @@ extern "C" { #[must_use] /** @brief Sets an event wait node's event. - @param [in] node - instane of the node to set event to. - @param [in] event - pointer to the event. + @param [in] node - Instance of the node to set event of. + @param [in] event - Pointer to the event. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphEventWaitNodeSetEvent( node: hipGraphNode_t, event: hipEvent_t, @@ -11086,8 +12004,8 @@ extern "C" { @param [in] hNode - node from the graph which was used to instantiate graphExec. @param [in] event - pointer to the event. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphExecEventWaitNodeSetEvent( hGraphExec: hipGraphExec_t, hNode: hipGraphNode_t, @@ -11099,13 +12017,13 @@ extern "C" { /** @brief Creates a memory allocation node and adds it to a graph @param [out] pGraphNode - Pointer to the graph node to create and add to the graph - @param [in] graph - Instane of the graph the node to be added - @param [in] pDependencies - Const pointer to the node dependenties + @param [in] graph - Instance of the graph the node is added to + @param [in] pDependencies - Const pointer to the node dependencies @param [in] numDependencies - The number of dependencies - @param [in] pNodeParams - Node parameters for memory allocation + @param [in, out] pNodeParams - Node parameters for memory allocation, returns a pointer to the allocated memory. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta.
While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphAddMemAllocNode( pGraphNode: *mut hipGraphNode_t, graph: hipGraph_t, @@ -11118,11 +12036,11 @@ extern "C" { #[must_use] /** @brief Returns parameters for memory allocation node - @param [in] node - Memory allocation node for a query + @param [in] node - Memory allocation node to query @param [out] pNodeParams - Parameters for the specified memory allocation node @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphMemAllocNodeGetParams( node: hipGraphNode_t, pNodeParams: *mut hipMemAllocNodeParams, @@ -11133,13 +12051,13 @@ extern "C" { /** @brief Creates a memory free node and adds it to a graph @param [out] pGraphNode - Pointer to the graph node to create and add to the graph - @param [in] graph - Instane of the graph the node to be added - @param [in] pDependencies - Const pointer to the node dependenties + @param [in] graph - Instance of the graph the node is added to + @param [in] pDependencies - Const pointer to the node dependencies @param [in] numDependencies - The number of dependencies @param [in] dev_ptr - Pointer to the memory to be freed @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphAddMemFreeNode( pGraphNode: *mut hipGraphNode_t, graph: hipGraph_t, @@ -11152,11 +12070,11 @@ extern "C" { #[must_use] /** @brief Returns parameters for memory free node - @param [in] node - Memory free node for a query - @param [out] dev_ptr - Device pointer for the specified memory free node + @param [in] node - Memory free node to query + @param [out] dev_ptr - Device pointer of the specified memory free node @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphMemFreeNodeGetParams( node: hipGraphNode_t, dev_ptr: *mut ::core::ffi::c_void, @@ -11166,12 +12084,12 @@ extern "C" { #[must_use] /** @brief Get the mem attribute for graphs. - @param [in] device - device the attr is get for. - @param [in] attr - attr to get. - @param [out] value - value for specific attr. + @param [in] device - Device to get attributes from + @param [in] attr - Attribute type to be queried + @param [out] value - Value of the queried attribute @returns #hipSuccess, #hipErrorInvalidDevice - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipDeviceGetGraphMemAttribute( device: ::core::ffi::c_int, attr: hipGraphMemAttributeType, @@ -11182,12 +12100,12 @@ extern "C" { #[must_use] /** @brief Set the mem attribute for graphs. - @param [in] device - device the attr is set for.
- @param [in] attr - attr to set. - @param [in] value - value for specific attr. + @param [in] device - Device to set the attribute for. + @param [in] attr - Attribute type to be set. + @param [in] value - Value of the attribute. @returns #hipSuccess, #hipErrorInvalidDevice - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipDeviceSetGraphMemAttribute( device: ::core::ffi::c_int, attr: hipGraphMemAttributeType, @@ -11196,13 +12114,13 @@ } extern "C" { #[must_use] - /** @brief Free unused memory on specific device used for graph back to OS. + /** @brief Free unused memory reserved for graphs on a specific device and return it to the OS. - @param [in] device - device the memory is used for graphs + @param [in] device - Device for which memory should be trimmed @returns #hipSuccess, #hipErrorInvalidDevice - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipDeviceGraphMemTrim(device: ::core::ffi::c_int) -> hipError_t; } extern "C" { @@ -11215,8 +12133,8 @@ extern "C" { @param [in] initialRefcount - reference to resource. @param [in] flags - flags passed to API. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipUserObjectCreate( object_out: *mut hipUserObject_t, ptr: *mut ::core::ffi::c_void, @@ -11232,8 +12150,8 @@ extern "C" { @param [in] object - pointer to instace of userobj. @param [in] count - reference to resource to be retained. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipUserObjectRelease( object: hipUserObject_t, count: ::core::ffi::c_uint, @@ -11246,8 +12164,8 @@ extern "C" { @param [in] object - pointer to instace of userobj. @param [in] count - reference to resource to be retained. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipUserObjectRetain( object: hipUserObject_t, count: ::core::ffi::c_uint, @@ -11262,8 +12180,8 @@ extern "C" { @param [in] count - reference to resource to be retained. @param [in] flags - flags passed to API. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta.
While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphRetainUserObject( graph: hipGraph_t, object: hipUserObject_t, @@ -11279,8 +12197,8 @@ extern "C" { @param [in] object - pointer to instace of userobj. @param [in] count - reference to resource to be retained. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphReleaseUserObject( graph: hipGraph_t, object: hipUserObject_t, @@ -11295,8 +12213,8 @@ extern "C" { @param [in] path - path to write the DOT file. @param [in] flags - Flags from hipGraphDebugDotFlags to get additional node information. @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorOperatingSystem - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphDebugDotPrint( graph: hipGraph_t, path: *const ::core::ffi::c_char, @@ -11315,8 +12233,8 @@ extern "C" { For list of attributes see ::hipKernelNodeAttrID. @returns #hipSuccess, #hipErrorInvalidContext - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphKernelNodeCopyAttributes( hSrc: hipGraphNode_t, hDst: hipGraphNode_t, @@ -11342,8 +12260,8 @@ extern "C" { @param [in] isEnabled - Node is enabled if != 0, otherwise the node is disabled. @returns #hipSuccess, #hipErrorInvalidValue, - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphNodeSetEnabled( hGraphExec: hipGraphExec_t, hNode: hipGraphNode_t, @@ -11368,8 +12286,8 @@ extern "C" { @param [out] isEnabled - Location to return the enabled status of the node. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphNodeGetEnabled( hGraphExec: hipGraphExec_t, hNode: hipGraphNode_t, @@ -11386,8 +12304,8 @@ extern "C" { @param [in] numDependencies - the number of the dependencies. @param [in] nodeParams -pointer to the parameters. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphAddExternalSemaphoresWaitNode( pGraphNode: *mut hipGraphNode_t, graph: hipGraph_t, @@ -11406,8 +12324,8 @@ extern "C" { @param [in] numDependencies - the number of the dependencies. @param [in] nodeParams -pointer to the parameters. 
@returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphAddExternalSemaphoresSignalNode( pGraphNode: *mut hipGraphNode_t, graph: hipGraph_t, @@ -11423,8 +12341,8 @@ extern "C" { @param [in] hNode - Node from the graph from which graphExec was instantiated. @param [in] nodeParams - Pointer to the params to be set. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphExternalSemaphoresSignalNodeSetParams( hNode: hipGraphNode_t, nodeParams: *const hipExternalSemaphoreSignalNodeParams, @@ -11437,8 +12355,8 @@ extern "C" { @param [in] hNode - Node from the graph from which graphExec was instantiated. @param [in] nodeParams - Pointer to the params to be set. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphExternalSemaphoresWaitNodeSetParams( hNode: hipGraphNode_t, nodeParams: *const hipExternalSemaphoreWaitNodeParams, @@ -11451,8 +12369,8 @@ extern "C" { @param [in] hNode - Node from the graph from which graphExec was instantiated. @param [out] params_out - Pointer to params. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphExternalSemaphoresSignalNodeGetParams( hNode: hipGraphNode_t, params_out: *mut hipExternalSemaphoreSignalNodeParams, @@ -11465,8 +12383,8 @@ extern "C" { @param [in] hNode - Node from the graph from which graphExec was instantiated. @param [out] params_out - Pointer to params. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphExternalSemaphoresWaitNodeGetParams( hNode: hipGraphNode_t, params_out: *mut hipExternalSemaphoreWaitNodeParams, @@ -11480,8 +12398,8 @@ extern "C" { @param [in] hNode - Node from the graph from which graphExec was instantiated. @param [in] nodeParams - Pointer to the params to be set. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. 
While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphExecExternalSemaphoresSignalNodeSetParams( hGraphExec: hipGraphExec_t, hNode: hipGraphNode_t, @@ -11496,14 +12414,42 @@ extern "C" { @param [in] hNode - Node from the graph from which graphExec was instantiated. @param [in] nodeParams - Pointer to the params to be set. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipGraphExecExternalSemaphoresWaitNodeSetParams( hGraphExec: hipGraphExec_t, hNode: hipGraphNode_t, nodeParams: *const hipExternalSemaphoreWaitNodeParams, ) -> hipError_t; } +extern "C" { + #[must_use] + /** @brief Gets a memcpy node's parameters. + + @param [in] hNode - instance of the node to get parameters from. + @param [out] nodeParams - pointer to the parameters. + @returns #hipSuccess, #hipErrorInvalidValue + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ + pub fn hipDrvGraphMemcpyNodeGetParams( + hNode: hipGraphNode_t, + nodeParams: *mut HIP_MEMCPY3D, + ) -> hipError_t; +} +extern "C" { + #[must_use] + /** @brief Sets a memcpy node's parameters. + + @param [in] hNode - instance of the node to set parameters for. + @param [in] nodeParams - pointer to the parameters. + @returns #hipSuccess, #hipErrorInvalidValue + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ + pub fn hipDrvGraphMemcpyNodeSetParams( + hNode: hipGraphNode_t, + nodeParams: *const HIP_MEMCPY3D, + ) -> hipError_t; +} extern "C" { #[must_use] /** @brief Creates a memset node and adds it to a graph. @@ -11515,8 +12461,8 @@ extern "C" { @param [in] memsetParams - const pointer to the parameters for the memory set. @param [in] ctx - cotext related to current device. @returns #hipSuccess, #hipErrorInvalidValue - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues.*/ + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ pub fn hipDrvGraphAddMemsetNode( phGraphNode: *mut hipGraphNode_t, hGraph: hipGraph_t, @@ -11526,6 +12472,62 @@ extern "C" { ctx: hipCtx_t, ) -> hipError_t; } +extern "C" { + #[must_use] + /** @brief Creates a memory free node and adds it to a graph + + @param [out] phGraphNode - Pointer to the graph node to create and add to the graph + @param [in] hGraph - Instance of the graph to which the node will be added + @param [in] dependencies - Const pointer to the node dependencies + @param [in] numDependencies - The number of dependencies + @param [in] dptr - Pointer to the memory to be freed + @returns #hipSuccess, #hipErrorInvalidValue + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ + pub fn hipDrvGraphAddMemFreeNode( + phGraphNode: *mut hipGraphNode_t, + hGraph: hipGraph_t, + dependencies: *const hipGraphNode_t, + numDependencies: usize, + dptr: hipDeviceptr_t, + ) -> hipError_t; +} +extern "C" { + #[must_use] + /** @brief Sets the parameters for a memcpy node in the given graphExec. + + @param [in] hGraphExec - instance of the executable graph with the node.
+ @param [in] hNode - instance of the node to set parameters for. + @param [in] copyParams - const pointer to the memcpy node params. + @param [in] ctx - context related to current device. + @returns #hipSuccess, #hipErrorInvalidValue + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ + pub fn hipDrvGraphExecMemcpyNodeSetParams( + hGraphExec: hipGraphExec_t, + hNode: hipGraphNode_t, + copyParams: *const HIP_MEMCPY3D, + ctx: hipCtx_t, + ) -> hipError_t; +} +extern "C" { + #[must_use] + /** @brief Sets the parameters for a memset node in the given graphExec. + + @param [in] hGraphExec - instance of the executable graph with the node. + @param [in] hNode - instance of the node to set parameters for. + @param [in] memsetParams - pointer to the parameters. + @param [in] ctx - context related to current device. + @returns #hipSuccess, #hipErrorInvalidValue + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.*/ + pub fn hipDrvGraphExecMemsetNodeSetParams( + hGraphExec: hipGraphExec_t, + hNode: hipGraphNode_t, + memsetParams: *const HIP_MEMSET_NODE_PARAMS, + ctx: hipCtx_t, + ) -> hipError_t; +} extern "C" { #[must_use] /** @brief Frees an address range reservation made via hipMemAddressReserve @@ -11533,10 +12535,10 @@ extern "C" { @param [in] devPtr - starting address of the range. @param [in] size - size of the range. @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues. - @note This API is implemented on Linux, under development on Windows.*/ + @note This API is implemented on Linux and is under development on Microsoft Windows.*/ pub fn hipMemAddressFree( devPtr: *mut ::core::ffi::c_void, size: usize, @@ -11552,10 +12554,10 @@ extern "C" { @param [in] addr - requested starting address of the range. @param [in] flags - currently unused, must be zero. @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues. - @note This API is implemented on Linux, under development on Windows.*/ + @note This API is implemented on Linux and is under development on Microsoft Windows.*/ pub fn hipMemAddressReserve( ptr: *mut *mut ::core::ffi::c_void, size: usize, @@ -11573,10 +12575,10 @@ @param [in] prop - properties of the allocation. @param [in] flags - currently unused, must be zero. @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues.
- @note This API is implemented on Linux, under development on Windows.*/ + @note This API is implemented on Linux and is under development on Microsoft Windows.*/ pub fn hipMemCreate( handle: *mut hipMemGenericAllocationHandle_t, size: usize, @@ -11593,10 +12595,10 @@ extern "C" { @param [in] handleType - type of the shareable handle. @param [in] flags - currently unused, must be zero. @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues. - @note This API is implemented on Linux, under development on Windows.*/ + @note This API is implemented on Linux and is under development on Microsoft Windows.*/ pub fn hipMemExportToShareableHandle( shareableHandle: *mut ::core::ffi::c_void, handle: hipMemGenericAllocationHandle_t, @@ -11612,10 +12614,10 @@ extern "C" { @param [in] location - target location. @param [in] ptr - address to check the access flags. @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues. - @note This API is implemented on Linux, under development on Windows.*/ + @note This API is implemented on Linux and is under development on Microsoft Windows.*/ pub fn hipMemGetAccess( flags: *mut ::core::ffi::c_ulonglong, location: *const hipMemLocation, @@ -11630,10 +12632,10 @@ extern "C" { @param [in] prop - location properties. @param [in] option - determines which granularity to return. @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues. - @note This API is implemented on Linux, under development on Windows. + @note This API is implemented on Linux and is under development on Microsoft Windows. */ pub fn hipMemGetAllocationGranularity( granularity: *mut usize, @@ -11648,10 +12650,10 @@ extern "C" { @param [out] prop - properties of the given handle. @param [in] handle - handle to perform the query on. @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues. - @note This API is implemented on Linux under development on Windows.*/ + @note This API is implemented on Linux and is under development on Microsoft Windows.*/ pub fn hipMemGetAllocationPropertiesFromHandle( prop: *mut hipMemAllocationProp, handle: hipMemGenericAllocationHandle_t, @@ -11665,10 +12667,10 @@ extern "C" { @param [in] osHandle - shareable handle representing the memory allocation. @param [in] shHandleType - handle type. 
@returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues. - @note This API is implemented on Linux, under development on Windows.*/ + @note This API is implemented on Linux and is under development on Microsoft Windows.*/ pub fn hipMemImportFromShareableHandle( handle: *mut hipMemGenericAllocationHandle_t, osHandle: *mut ::core::ffi::c_void, @@ -11685,10 +12687,10 @@ extern "C" { @param [in] handle - memory allocation to be mapped. @param [in] flags - currently unused, must be zero. @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues. - @note This API is implemented on Linux, under development on Windows.*/ + @note This API is implemented on Linux and is under development on Microsoft Windows.*/ pub fn hipMemMap( ptr: *mut ::core::ffi::c_void, size: usize, @@ -11705,10 +12707,8 @@ extern "C" { @param [in] count - number of hipArrayMapInfo in mapInfoList. @param [in] stream - stream identifier for the stream to use for map or unmap operations. @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. - - @note This API is implemented on Linux, under development on Windows.*/ + @warning This API is under development. Currently it is not supported on AMD + GPUs and returns #hipErrorNotSupported.*/ pub fn hipMemMapArrayAsync( mapInfoList: *mut hipArrayMapInfo, count: ::core::ffi::c_uint, @@ -11721,10 +12721,10 @@ extern "C" { @param [in] handle - handle of the memory allocation. @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues. - @note This API is implemented on Linux, under development on Windows.*/ + @note This API is implemented on Linux and is under development on Microsoft Windows.*/ pub fn hipMemRelease(handle: hipMemGenericAllocationHandle_t) -> hipError_t; } extern "C" { @@ -11734,10 +12734,10 @@ extern "C" { @param [out] handle - handle representing addr. @param [in] addr - address to look up. @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues. - @note This API is implemented on Linux, under development on Windows.*/ + @note This API is implemented on Linux and is under development on Microsoft Windows.*/ pub fn hipMemRetainAllocationHandle( handle: *mut hipMemGenericAllocationHandle_t, addr: *mut ::core::ffi::c_void, @@ -11752,10 +12752,10 @@ extern "C" { @param [in] desc - array of hipMemAccessDesc. 
@param [in] count - number of hipMemAccessDesc in desc. @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues. - @note This API is implemented on Linux, under development on Windows.*/ + @note This API is implemented on Linux and is under development on Microsoft Windows.*/ pub fn hipMemSetAccess( ptr: *mut ::core::ffi::c_void, size: usize, @@ -11770,10 +12770,10 @@ extern "C" { @param [in] ptr - starting address of the range to unmap. @param [in] size - size of the virtual address range. @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported - @warning : This API is marked as beta, meaning, while this is feature complete, - it is still open to changes and may have outstanding issues. + @warning This API is marked as Beta. While this feature is complete, it can + change and might have outstanding issues. - @note This API is implemented on Linux, under development on Windows.*/ + @note This API is implemented on Linux and is under development on Microsoft Windows.*/ pub fn hipMemUnmap(ptr: *mut ::core::ffi::c_void, size: usize) -> hipError_t; } extern "C" { @@ -12434,6 +13434,12 @@ impl hipErrorCode_t { pub const GraphExecUpdateFailure: hipErrorCode_t = hipErrorCode_t(unsafe { ::core::num::NonZeroU32::new_unchecked(910) }); + pub const InvalidChannelDescriptor: hipErrorCode_t = hipErrorCode_t(unsafe { + ::core::num::NonZeroU32::new_unchecked(911) + }); + pub const InvalidTexture: hipErrorCode_t = hipErrorCode_t(unsafe { + ::core::num::NonZeroU32::new_unchecked(912) + }); pub const Unknown: hipErrorCode_t = hipErrorCode_t(unsafe { ::core::num::NonZeroU32::new_unchecked(999) }); @@ -12645,6 +13651,12 @@ pub trait hipError_tConsts { const ErrorGraphExecUpdateFailure: hipError_t = hipError_t::Err( hipErrorCode_t::GraphExecUpdateFailure, ); + const ErrorInvalidChannelDescriptor: hipError_t = hipError_t::Err( + hipErrorCode_t::InvalidChannelDescriptor, + ); + const ErrorInvalidTexture: hipError_t = hipError_t::Err( + hipErrorCode_t::InvalidTexture, + ); const ErrorUnknown: hipError_t = hipError_t::Err(hipErrorCode_t::Unknown); const ErrorRuntimeMemory: hipError_t = hipError_t::Err( hipErrorCode_t::RuntimeMemory, diff --git a/ptx_parser/src/lib.rs b/ptx_parser/src/lib.rs index da46a8c..6e191fe 100644 --- a/ptx_parser/src/lib.rs +++ b/ptx_parser/src/lib.rs @@ -284,7 +284,7 @@ fn immediate_value<'a, 'input>(stream: &mut PtxParser<'a, 'input>) -> PResult { -pub fn parse_for_errors<'input>(text: &'input str) -> Vec<PtxError> { +pub fn parse_for_errors<'input>(text: &'input str) -> Vec<PtxError<'input>> { let (tokens, mut errors) = lex_with_span_unchecked(text); let parse_result = { let state = PtxParserState::new(text, &mut errors); @@ -307,7 +307,7 @@ fn lex_with_span_unchecked<'input>( text: &'input str, -) -> (Vec<(Token<'input>, logos::Span)>, Vec<PtxError>) { +) -> (Vec<(Token<'input>, logos::Span)>, Vec<PtxError<'input>>) { let lexer = Token::lexer(text); let mut result = Vec::new(); let mut errors = Vec::new(); @@ -322,7 +322,7 @@ pub fn parse_module_checked<'input>( text: &'input str, -) -> Result<ast::Module<'input>, Vec<PtxError>> { +) -> Result<ast::Module<'input>, Vec<PtxError<'input>>> { let mut lexer = Token::lexer(text); let mut errors = Vec::new(); let mut tokens = Vec::new(); @@ -1194,7 +1194,7 @@ impl ast::ParsedOperand { ) -> PResult> { use
winnow::combinator::*; use winnow::token::any; - fn vector_index<'input>(inp: &'input str) -> Result<u8, PtxError> { + fn vector_index<'input>(inp: &'input str) -> Result<u8, PtxError<'input>> { match inp { ".x" | ".r" => Ok(0), ".y" | ".g" => Ok(1), diff --git a/zluda_bindgen/build/cuda_wrapper.h b/zluda_bindgen/build/cuda_wrapper.h index a550256..2376794 100644 --- a/zluda_bindgen/build/cuda_wrapper.h +++ b/zluda_bindgen/build/cuda_wrapper.h @@ -5,3 +5,5 @@ #include #include #include +#include +#include diff --git a/zluda_bindgen/build/cudnn_v8/cudnn_adv_infer.h b/zluda_bindgen/build/cudnn_v8/cudnn_adv_infer.h new file mode 100644 index 0000000..fbd527b --- /dev/null +++ b/zluda_bindgen/build/cudnn_v8/cudnn_adv_infer.h @@ -0,0 +1 @@ +#include <cudnn_adv_infer_v8.h> diff --git a/zluda_bindgen/build/cudnn_v8/cudnn_adv_train.h b/zluda_bindgen/build/cudnn_v8/cudnn_adv_train.h new file mode 100644 index 0000000..15c97e7 --- /dev/null +++ b/zluda_bindgen/build/cudnn_v8/cudnn_adv_train.h @@ -0,0 +1 @@ +#include <cudnn_adv_train_v8.h> diff --git a/zluda_bindgen/build/cudnn_v8/cudnn_backend.h b/zluda_bindgen/build/cudnn_v8/cudnn_backend.h new file mode 100644 index 0000000..8919805 --- /dev/null +++ b/zluda_bindgen/build/cudnn_v8/cudnn_backend.h @@ -0,0 +1 @@ +#include <cudnn_backend_v8.h> diff --git a/zluda_bindgen/build/cudnn_v8/cudnn_cnn_infer.h b/zluda_bindgen/build/cudnn_v8/cudnn_cnn_infer.h new file mode 100644 index 0000000..4933e9d --- /dev/null +++ b/zluda_bindgen/build/cudnn_v8/cudnn_cnn_infer.h @@ -0,0 +1 @@ +#include <cudnn_cnn_infer_v8.h> diff --git a/zluda_bindgen/build/cudnn_v8/cudnn_cnn_train.h b/zluda_bindgen/build/cudnn_v8/cudnn_cnn_train.h new file mode 100644 index 0000000..9921348 --- /dev/null +++ b/zluda_bindgen/build/cudnn_v8/cudnn_cnn_train.h @@ -0,0 +1 @@ +#include <cudnn_cnn_train_v8.h> diff --git a/zluda_bindgen/build/cudnn_v8/cudnn_ops_infer.h b/zluda_bindgen/build/cudnn_v8/cudnn_ops_infer.h new file mode 100644 index 0000000..e3aa4d2 --- /dev/null +++ b/zluda_bindgen/build/cudnn_v8/cudnn_ops_infer.h @@ -0,0 +1 @@ +#include <cudnn_ops_infer_v8.h> diff --git a/zluda_bindgen/build/cudnn_v8/cudnn_ops_train.h b/zluda_bindgen/build/cudnn_v8/cudnn_ops_train.h new file mode 100644 index 0000000..b7a6aad --- /dev/null +++ b/zluda_bindgen/build/cudnn_v8/cudnn_ops_train.h @@ -0,0 +1 @@ +#include <cudnn_ops_train_v8.h> diff --git a/zluda_bindgen/build/cudnn_v8/cudnn_version.h b/zluda_bindgen/build/cudnn_v8/cudnn_version.h new file mode 100644 index 0000000..8887e0a --- /dev/null +++ b/zluda_bindgen/build/cudnn_v8/cudnn_version.h @@ -0,0 +1 @@ +#include <cudnn_version_v8.h> diff --git a/zluda_bindgen/build/cufft_wraper.h b/zluda_bindgen/build/cufft_wraper.h new file mode 100644 index 0000000..06512f5 --- /dev/null +++ b/zluda_bindgen/build/cufft_wraper.h @@ -0,0 +1,2 @@ +#include +#include diff --git a/zluda_bindgen/src/main.rs b/zluda_bindgen/src/main.rs index bfa9d49..ccfa2c5 100644 --- a/zluda_bindgen/src/main.rs +++ b/zluda_bindgen/src/main.rs @@ -1,11 +1,13 @@ use proc_macro2::Span; use quote::{format_ident, quote, ToTokens}; use rustc_hash::{FxHashMap, FxHashSet}; -use std::{collections::hash_map, fs::File, io::Write, iter, path::PathBuf, str::FromStr}; +use std::{ + borrow::Cow, collections::hash_map, fs::File, io::Write, iter, path::PathBuf, str::FromStr, +}; use syn::{ parse_quote, punctuated::Punctuated, visit_mut::VisitMut, Abi, Fields, FieldsUnnamed, FnArg, ForeignItem, ForeignItemFn, Ident, Item, ItemConst, ItemForeignMod, ItemUse, LitStr, Path, - PathArguments, Signature, Type, TypePath, UseTree, PathSegment + PathArguments, PathSegment, Signature, Type, TypePath, UseTree, }; fn main() { @@ -14,23 +16,511 @@ fn main() { &crate_root, &["..", "ext", "hip_runtime-sys", "src",
"lib.rs"], ); - generate_ml(&crate_root); generate_cuda(&crate_root); + generate_ml(&crate_root); + generate_cublas(&crate_root); + generate_cublaslt(&crate_root); + generate_cudnn(&crate_root); + generate_cufft(&crate_root); + generate_cusparse(&crate_root); +} + +fn generate_cufft(crate_root: &PathBuf) { + let cufft_header = new_builder() + .header_contents("cufft_wraper.h", include_str!("../build/cufft_wraper.h")) + .header("/usr/local/cuda/include/cufftXt.h") + .allowlist_type("^cufft.*") + .allowlist_type("^cudaLibXtDesc.*") + .allowlist_type("^cudaXtDesc.*") + .allowlist_type("^libFormat.*") + .allowlist_function("^cufft.*") + .allowlist_var("^CUFFT_.*") + .must_use_type("cufftResult_t") + .allowlist_recursively(false) + .clang_args(["-I/usr/local/cuda/include"]) + .generate() + .unwrap() + .to_string(); + let module: syn::File = syn::parse_str(&cufft_header).unwrap(); + generate_functions( + &crate_root, + "cufft", + &["..", "cuda_base", "src", "cufft.rs"], + &module, + ); + generate_types_library( + &crate_root, + &["..", "cuda_types", "src", "cufft.rs"], + &module, + ) +} + +fn generate_cusparse(crate_root: &PathBuf) { + let cufft_header = new_builder() + .header("/usr/local/cuda/include/cusparse_v2.h") + .allowlist_type("^cusparse.*") + .allowlist_type(".*Info_t$") + .allowlist_type(".*Info$") + .allowlist_function("^cusparse.*") + .allowlist_var("^CUSPARSE_.*") + .must_use_type("cusparseStatus_t") + .allowlist_recursively(false) + .clang_args(["-I/usr/local/cuda/include"]) + .generate() + .unwrap() + .to_string(); + let module: syn::File = syn::parse_str(&cufft_header).unwrap(); + generate_functions( + &crate_root, + "cusparse", + &["..", "cuda_base", "src", "cusparse.rs"], + &module, + ); + generate_types_library( + &crate_root, + &["..", "cuda_types", "src", "cusparse.rs"], + &module, + ) +} + +fn generate_cudnn(crate_root: &PathBuf) { + let cudnn9 = new_builder() + .header("/usr/include/x86_64-linux-gnu/cudnn_v9.h") + .allowlist_type("^cudnn.*") + .allowlist_function("^cudnn.*") + .allowlist_var("^CUDNN_.*") + .must_use_type("cudnnStatus_t") + .allowlist_recursively(false) + .clang_args(["-I/usr/local/cuda/include"]) + .generate() + .unwrap() + .to_string(); + let cudnn9_module: syn::File = syn::parse_str(&cudnn9).unwrap(); + let cudnn9_types = generate_types_library_impl(&cudnn9_module); + let mut current_dir = PathBuf::from(file!()); + current_dir.pop(); + let cudnn8 = new_builder() + .header("/usr/include/x86_64-linux-gnu/cudnn_v8.h") + .allowlist_type("^cudnn.*") + .allowlist_function("^cudnn.*") + .allowlist_var("^CUDNN_.*") + .must_use_type("cudnnStatus_t") + .allowlist_recursively(false) + .clang_args([ + "-I/usr/local/cuda/include", + &format!("-I{}/../build/cudnn_v8", current_dir.display()), + ]) + .generate() + .unwrap() + .to_string(); + let cudnn8_module: syn::File = syn::parse_str(&cudnn8).unwrap(); + let cudnn8_types = generate_types_library_impl(&cudnn8_module); + merge_types( + &crate_root, + &["..", "cuda_types", "src", "cudnn.rs"], + cudnn9_types, + &["..", "cuda_types", "src", "cudnn9.rs"], + cudnn8_types, + &["..", "cuda_types", "src", "cudnn8.rs"], + ); + generate_functions( + &crate_root, + "cudnn8", + &["..", "cuda_base", "src", "cudnn8.rs"], + &cudnn8_module, + ); + generate_functions( + &crate_root, + "cudnn9", + &["..", "cuda_base", "src", "cudnn9.rs"], + &cudnn9_module, + ); +} + +// This code splits types (and constants) into one of: +// - cudnn8-specific +// - cudnn9-specific +// - cudnn shared +// With the rules being: +// - constants go to the 
version-specific files +// - if there's conflict between types they go to version-specific files +// - if the cudnn9 type is purely additive over cudnn8 then it goes into the +// shared (and is re-exported by both) +fn merge_types( + output: &PathBuf, + cudnn_path: &[&str], + cudnn9_types: syn::File, + cudnn9_path: &[&str], + cudnn8_types: syn::File, + cudnn8_path: &[&str], +) { + let cudnn_enums = merge_enums(&cudnn9_types, &cudnn8_types); + let conflicting_types = get_conflicting_structs(&cudnn9_types, &cudnn8_types, cudnn_enums); + write_common_cudnn_types(output, cudnn_path, &cudnn9_types, &conflicting_types); + write_cudnn8_types(output, cudnn8_path, &cudnn8_types, &conflicting_types); + write_cudnn9_types(output, cudnn9_path, &cudnn9_types, &conflicting_types); +} + +fn write_cudnn9_types( + output: &PathBuf, + cudnn9_path: &[&str], + cudnn9_types: &syn::File, + conflicting_types: &FxHashMap<&Ident, CudnnEnumMergeResult>, +) { + let items = cudnn9_types.items.iter().filter_map(|item| match item { + Item::Impl(impl_) => match conflicting_types.get(type_to_ident(&*impl_.self_ty)) { + Some(CudnnEnumMergeResult::Conflict) | Some(CudnnEnumMergeResult::Cudnn9) | None => { + Option::::Some(parse_quote!( #impl_)) + } + Some(CudnnEnumMergeResult::Same) => None, + }, + Item::Struct(struct_) => match conflicting_types.get(&struct_.ident) { + Some(CudnnEnumMergeResult::Conflict) | Some(CudnnEnumMergeResult::Cudnn9) | None => { + Some(parse_quote!( #struct_)) + } + Some(CudnnEnumMergeResult::Same) => { + let type_ = &struct_.ident; + Some(parse_quote!( pub use super::cudnn:: #type_; )) + } + }, + Item::Enum(enum_) => match conflicting_types.get(&enum_.ident) { + Some(CudnnEnumMergeResult::Conflict) | Some(CudnnEnumMergeResult::Cudnn9) | None => { + Some(parse_quote!( #enum_)) + } + Some(CudnnEnumMergeResult::Same) => { + let type_ = &enum_.ident; + Some(parse_quote!( pub use super::cudnn:: #type_; )) + } + }, + Item::ForeignMod(ItemForeignMod { .. }) => None, + Item::Const(const_) => Some(parse_quote!(#const_)), + Item::Union(union_) => match conflicting_types.get(&union_.ident) { + Some(CudnnEnumMergeResult::Conflict) | Some(CudnnEnumMergeResult::Cudnn9) | None => { + Some(parse_quote!( #union_)) + } + Some(CudnnEnumMergeResult::Same) => { + let type_ = &union_.ident; + Some(parse_quote!( pub use super::cudnn:: #type_; )) + } + }, + Item::Use(use_) => Some(parse_quote!(#use_)), + Item::Type(type_) => Some(parse_quote!(#type_)), + _ => unimplemented!(), + }); + let module: syn::File = parse_quote! 
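+    // Editorial sketch (an assumption-laden example, not generated output):
+    // suppose cudnn8 declares `cudnnMathType_t` with the constants
+    // { DEFAULT_MATH, TENSOR_OP_MATH } and cudnn9 declares the same type with
+    // { DEFAULT_MATH, TENSOR_OP_MATH, FMA_MATH }. merge_enums above sees that
+    // cudnn8 has no extra variants while cudnn9 does, so the type is tagged
+    // CudnnEnumMergeResult::Cudnn9: it is emitted here, in cudnn9.rs, and
+    // cudnn8.rs re-exports it as `pub use super::cudnn9::cudnnMathType_t;`.
+    // Identical definitions are tagged Same and land in the shared cudnn.rs,
+    // while definitions where each version has variants the other lacks are
+    // tagged Conflict and stay version-specific.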
{ + #(#items)* + }; + let mut output = output.clone(); + output.extend(cudnn9_path); + let text = prettyplease::unparse(&module); + write_rust_to_file(output, &text) +} + +fn write_cudnn8_types( + output: &PathBuf, + cudnn8_path: &[&str], + cudnn8_types: &syn::File, + conflicting_types: &FxHashMap<&Ident, CudnnEnumMergeResult>, +) { + let items = cudnn8_types.items.iter().filter_map(|item| match item { + Item::Impl(impl_) => match conflicting_types.get(type_to_ident(&*impl_.self_ty)) { + Some(CudnnEnumMergeResult::Conflict) | None => { + Option::::Some(parse_quote!( #impl_)) + } + Some(CudnnEnumMergeResult::Same) => None, + Some(CudnnEnumMergeResult::Cudnn9) => None, + }, + Item::Struct(struct_) => match conflicting_types.get(&struct_.ident) { + Some(CudnnEnumMergeResult::Conflict) | None => Some(parse_quote!( #struct_)), + Some(CudnnEnumMergeResult::Same) => { + let type_ = &struct_.ident; + Some(parse_quote!( pub use super::cudnn:: #type_; )) + } + Some(CudnnEnumMergeResult::Cudnn9) => { + let type_ = &struct_.ident; + Some(parse_quote!( pub use super::cudnn9:: #type_; )) + } + }, + Item::Enum(enum_) => match conflicting_types.get(&enum_.ident) { + Some(CudnnEnumMergeResult::Conflict) | None => Some(parse_quote!( #enum_)), + Some(CudnnEnumMergeResult::Same) => { + let type_ = &enum_.ident; + Some(parse_quote!( pub use super::cudnn:: #type_; )) + } + Some(CudnnEnumMergeResult::Cudnn9) => { + let type_ = &enum_.ident; + Some(parse_quote!( pub use super::cudnn9:: #type_; )) + } + }, + Item::ForeignMod(ItemForeignMod { .. }) => None, + Item::Const(const_) => Some(parse_quote!(#const_)), + Item::Union(union_) => match conflicting_types.get(&union_.ident) { + Some(CudnnEnumMergeResult::Conflict) | None => Some(parse_quote!( #union_)), + Some(CudnnEnumMergeResult::Same) => { + let type_ = &union_.ident; + Some(parse_quote!( pub use super::cudnn:: #type_; )) + } + Some(CudnnEnumMergeResult::Cudnn9) => { + let type_ = &union_.ident; + Some(parse_quote!( pub use super::cudnn9:: #type_; )) + } + }, + Item::Use(use_) => Some(parse_quote!(#use_)), + Item::Type(type_) => Some(parse_quote!(#type_)), + _ => unimplemented!(), + }); + let module: syn::File = parse_quote! { + #(#items)* + }; + let mut output = output.clone(); + output.extend(cudnn8_path); + let text = prettyplease::unparse(&module); + write_rust_to_file(output, &text) +} + +fn write_common_cudnn_types( + output: &PathBuf, + cudnn_path: &[&str], + cudnn9_types: &syn::File, + conflicting_types: &FxHashMap<&Ident, CudnnEnumMergeResult>, +) { + let common_items = cudnn9_types.items.iter().filter_map(|item| match item { + Item::Impl(ref impl_) => match conflicting_types.get(type_to_ident(&*impl_.self_ty)) { + Some(CudnnEnumMergeResult::Conflict) => None, + Some(CudnnEnumMergeResult::Same) => { + let item: Item = parse_quote! { + #impl_ + }; + Some(item) + } + Some(CudnnEnumMergeResult::Cudnn9) => None, + None => None, + }, + Item::Struct(ref struct_) => match conflicting_types.get(&struct_.ident) { + Some(CudnnEnumMergeResult::Conflict) => None, + Some(CudnnEnumMergeResult::Same) => { + let item: Item = parse_quote! { + #struct_ + }; + Some(item) + } + Some(CudnnEnumMergeResult::Cudnn9) => None, + None => None, + }, + Item::Enum(ref enum_) => match conflicting_types.get(&enum_.ident) { + Some(CudnnEnumMergeResult::Conflict) => None, + Some(CudnnEnumMergeResult::Same) => { + let item: Item = parse_quote! { + #enum_ + }; + Some(item) + } + Some(CudnnEnumMergeResult::Cudnn9) => None, + None => None, + }, + Item::ForeignMod(ItemForeignMod { .. 
}) => None, + _ => None, + //_ => unimplemented!(), + }); + let cudnn_common: syn::File = parse_quote! { + #(#common_items)* + }; + let mut output = output.clone(); + output.extend(cudnn_path); + let text = prettyplease::unparse(&cudnn_common); + write_rust_to_file(output, &text) +} + +fn get_conflicting_structs<'a>( + cudnn9_types: &'a syn::File, + cudnn8_types: &'a syn::File, + mut enums: FxHashMap<&'a Ident, CudnnEnumMergeResult>, +) -> FxHashMap<&'a Ident, CudnnEnumMergeResult> { + let structs9 = get_structs(cudnn9_types); + let structs8 = get_structs(cudnn8_types); + for (struct_name8, struct8) in structs8 { + if enums.contains_key(struct_name8) { + continue; + } + match structs9.get(struct_name8) { + Some(struct9) => { + if struct8 != *struct9 { + panic!("{}", struct_name8.to_string()); + } + let has_conflicting_field = struct8.iter().any(|field| { + let type_ = type_to_ident(&field.ty); + enums.get(type_) == Some(&CudnnEnumMergeResult::Conflict) + }); + let value = if has_conflicting_field { + CudnnEnumMergeResult::Conflict + } else { + CudnnEnumMergeResult::Same + }; + assert!(enums.insert(struct_name8, value).is_none()); + } + None => {} + } + } + enums +} + +fn type_to_ident<'a>(ty: &'a syn::Type) -> &'a syn::Ident { + match ty { + Type::Path(path) => &path.path.segments[0].ident, + Type::Array(array) => type_to_ident(&array.elem), + _ => unimplemented!("{}", ty.to_token_stream().to_string()), + } +} + +fn merge_enums<'a>( + cudnn9_types: &'a syn::File, + cudnn8_types: &'a syn::File, +) -> FxHashMap<&'a Ident, CudnnEnumMergeResult> { + let result = { + let enums8 = get_enums(cudnn8_types); + let enums9 = get_enums(cudnn9_types); + enums8 + .iter() + .map(|(enum8_ident, enum8_vars)| { + let merge_result = match enums9.get(enum8_ident) { + Some(enum9_vars) => { + let e8_has_extra = enum8_vars.difference(&enum9_vars).any(|_| true); + let e9_has_extra = enum9_vars.difference(&enum8_vars).any(|_| true); + match (e8_has_extra, e9_has_extra) { + (false, false) => CudnnEnumMergeResult::Same, + (false, true) => CudnnEnumMergeResult::Cudnn9, + (true, true) => CudnnEnumMergeResult::Conflict, + (true, false) => unimplemented!(), + } + } + None => { + unimplemented!() + } + }; + (*enum8_ident, merge_result) + }) + .collect::>() + }; + result +} + +#[derive(Copy, Clone, PartialEq, Eq)] +enum CudnnEnumMergeResult { + // Conflicting definitions + Conflict, + // Identical definitions + Same, + // Enum present in both, but cudnn9 definition is a strict superset + Cudnn9, +} + +fn get_enums<'a>( + cudnn_module: &'a syn::File, +) -> FxHashMap<&'a Ident, FxHashSet<&'a syn::ImplItemConst>> { + let mut enums = FxHashMap::default(); + for item in cudnn_module.items.iter() { + match item { + Item::Impl(ref impl_) => match &*impl_.self_ty { + Type::Path(path) => { + let constant = match impl_.items[0] { + syn::ImplItem::Const(ref impl_item_const) => impl_item_const, + _ => unimplemented!(), + }; + enums + .entry(&path.path.segments[0].ident) + .or_insert(FxHashSet::default()) + .insert(constant); + } + _ => unimplemented!(), + }, + _ => {} + } + } + enums +} + +fn get_structs<'a>(cudnn_module: &'a syn::File) -> FxHashMap<&'a Ident, Cow<'a, syn::Fields>> { + let mut structs = FxHashMap::default(); + for item in cudnn_module.items.iter() { + match item { + Item::Struct(ref struct_) => { + assert!(structs + .insert(&struct_.ident, Cow::Borrowed(&struct_.fields)) + .is_none()); + } + Item::Union(ref union_) => { + assert!(structs + .insert( + &union_.ident, + 
Cow::Owned(syn::Fields::Named(union_.fields.clone())) + ) + .is_none()); + } + _ => {} + } + } + structs +} + +fn generate_cublas(crate_root: &PathBuf) { + let cublas_header = new_builder() + .header("/usr/local/cuda/include/cublas_v2.h") + .allowlist_type("^cublas.*") + .allowlist_function("^cublas.*") + .allowlist_var("^CUBLAS_.*") + .must_use_type("cublasStatus_t") + .allowlist_recursively(false) + .clang_args(["-I/usr/local/cuda/include", "-x", "c++"]) + .generate() + .unwrap() + .to_string(); + let module: syn::File = syn::parse_str(&cublas_header).unwrap(); + generate_functions( + &crate_root, + "cublas", + &["..", "cuda_base", "src", "cublas.rs"], + &module, + ); + generate_types_library( + &crate_root, + &["..", "cuda_types", "src", "cublas.rs"], + &module, + ) +} + +fn generate_cublaslt(crate_root: &PathBuf) { + let cublas_header = new_builder() + .header("/usr/local/cuda/include/cublasLt.h") + .allowlist_type("^cublas.*") + .allowlist_function("^cublasLt.*") + .allowlist_var("^CUBLASLT_.*") + .must_use_type("cublasStatus_t") + .allowlist_recursively(false) + .clang_args(["-I/usr/local/cuda/include", "-x", "c++"]) + .generate() + .unwrap() + .to_string(); + let module: syn::File = syn::parse_str(&cublas_header).unwrap(); + generate_functions( + &crate_root, + "cublaslt", + &["..", "cuda_base", "src", "cublaslt.rs"], + &module, + ); + generate_types_library( + &crate_root, + &["..", "cuda_types", "src", "cublaslt.rs"], + &module, + ) } fn generate_cuda(crate_root: &PathBuf) { - let cuda_header = bindgen::Builder::default() - .use_core() - .rust_target(bindgen::RustTarget::Stable_1_77) - .layout_tests(false) - .default_enum_style(bindgen::EnumVariation::NewType { - is_bitfield: false, - is_global: false, - }) - .derive_hash(true) - .derive_eq(true) + let cuda_header = new_builder() .header_contents("cuda_wrapper.h", include_str!("../build/cuda_wrapper.h")) .allowlist_type("^CU.*") + .allowlist_type("^cuda.*") + .allowlist_type("^cu.*Complex.*") + .allowlist_type("^libraryPropertyType.*") .allowlist_function("^cu.*") .allowlist_var("^CU.*") .must_use_type("cudaError_enum") @@ -67,22 +557,14 @@ fn generate_cuda(crate_root: &PathBuf) { } fn generate_ml(crate_root: &PathBuf) { - let ml_header = bindgen::Builder::default() - .use_core() - .rust_target(bindgen::RustTarget::Stable_1_77) - .layout_tests(false) - .default_enum_style(bindgen::EnumVariation::NewType { - is_bitfield: false, - is_global: false, - }) - .derive_hash(true) - .derive_eq(true) + let ml_header = new_builder() .header("/usr/local/cuda/include/nvml.h") .allowlist_type("^nvml.*") .allowlist_function("^nvml.*") .allowlist_var("^NVML.*") .must_use_type("nvmlReturn_t") .constified_enum("nvmlReturn_enum") + .clang_args(["-I/usr/local/cuda/include"]) .generate() .unwrap() .to_string(); @@ -112,37 +594,51 @@ fn generate_ml(crate_root: &PathBuf) { &["..", "cuda_base", "src", "nvml.rs"], &module, ); - generate_types( + generate_types_library( &crate_root, &["..", "cuda_types", "src", "nvml.rs"], &module, ); } -fn generate_types(crate_root: &PathBuf, path: &[&str], module: &syn::File) { +fn generate_types_library(crate_root: &PathBuf, path: &[&str], module: &syn::File) { + let module = generate_types_library_impl(module); + let mut output = crate_root.clone(); + output.extend(path); + let text = prettyplease::unparse(&module) + .replace("self::cudaDataType", "super::cuda::cudaDataType") + // complex as used by cuFFT + .replace(" cuComplex", " super::cuda::cuComplex") + .replace(" cuDoubleComplex", " 
super::cuda::cuDoubleComplex"); + write_rust_to_file(output, &text) +} + +fn generate_types_library_impl(module: &syn::File) -> syn::File { + let known_reexports: Punctuated = parse_quote! { + pub type __half = u16; + pub type __nv_bfloat16 = u16; + pub use super::cuda::cuComplex; + pub use super::cuda::cuDoubleComplex; + pub use super::cuda::cudaDataType; + pub use super::cuda::cudaDataType_t; + pub type cudaStream_t = super::cuda::CUstream; + pub use super::cuda::libraryPropertyType; + pub type cudaGraphExecUpdateResultInfo_st = super::cuda::CUgraphExecUpdateResultInfo_st; + pub type cudaAsyncNotificationType = super::cuda::CUasyncNotificationType_enum; + pub type cudaGraph_t = super::cuda::CUgraph; + }; let non_fn = module.items.iter().filter_map(|item| match item { Item::ForeignMod(_) => None, _ => Some(item), }); - let module: syn::File = parse_quote! { - #(#non_fn)* - }; - let mut output = crate_root.clone(); - output.extend(path); - write_rust_to_file(output, &prettyplease::unparse(&module)) + let items = known_reexports.iter().chain(non_fn); + parse_quote! { + #(#items)* + } } fn generate_hip_runtime(output: &PathBuf, path: &[&str]) { - let hiprt_header = bindgen::Builder::default() - .use_core() - .rust_target(bindgen::RustTarget::Stable_1_77) - .layout_tests(false) - .default_enum_style(bindgen::EnumVariation::NewType { - is_bitfield: false, - is_global: false, - }) - .derive_hash(true) - .derive_eq(true) + let hiprt_header = new_builder() .header("/opt/rocm/include/hip/hip_runtime_api.h") .allowlist_type("^hip.*") .allowlist_function("^hip.*") @@ -403,7 +899,7 @@ impl VisitMut for PrependCudaPath { fn visit_type_path_mut(&mut self, type_: &mut TypePath) { if type_.path.segments.len() == 1 { match &*type_.path.segments[0].ident.to_string() { - "usize" | "f64" | "f32" => {} + "usize" | "u32" | "i32" | "u64" | "i64" | "f64" | "f32" | "FILE" => {} _ => { let module = &self.module; *type_ = parse_quote! { cuda_types :: #module :: #type_ }; @@ -426,7 +922,7 @@ struct ExplicitReturnType; impl VisitMut for ExplicitReturnType { fn visit_return_type_mut(&mut self, i: &mut syn::ReturnType) { if let syn::ReturnType::Default = i { - *i = parse_quote! { -> {} }; + *i = parse_quote! 
{ -> () }; } } } @@ -459,6 +955,7 @@ fn generate_display( "CUdevResource_st", "CUlaunchAttribute_st", "CUlaunchConfig_st", + "CUmemcpy3DOperand_st", ]; let ignore_functions = [ "cuGLGetDevices", @@ -563,7 +1060,7 @@ fn cuda_derive_display_trait_for_item<'a>( state: &mut DeriveDisplayState<'a>, item: &'a Item, ) -> Option { - let path_prefix = & state.types_crate; + let path_prefix = &state.types_crate; let path_prefix_iter = iter::repeat(&path_prefix); let mut prepend_path = PrependCudaPath { module: Ident::new("cuda", Span::call_site()), @@ -798,3 +1295,16 @@ fn curesult_display_trait(derive_state: &DeriveDisplayState) -> syn::Item { } } } + +fn new_builder() -> bindgen::Builder { + bindgen::Builder::default() + .use_core() + .rust_target(bindgen::RustTarget::Stable_1_77) + .layout_tests(false) + .default_enum_style(bindgen::EnumVariation::NewType { + is_bitfield: false, + is_global: false, + }) + .derive_hash(true) + .derive_eq(true) +} diff --git a/zluda_blas/Cargo.toml b/zluda_blas/Cargo.toml new file mode 100644 index 0000000..d6fe818 --- /dev/null +++ b/zluda_blas/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "zluda_blas" +version = "0.0.0" +edition = "2021" + +[lib] +crate-type = ["cdylib"] +name = "cublas" + +[dependencies] +cuda_base = { path = "../cuda_base" } +cuda_types = { path = "../cuda_types" } + +[package.metadata.zluda] +linux_symlinks = [ + "libcublas.so.12", +] diff --git a/zluda_blas/src/impl.rs b/zluda_blas/src/impl.rs new file mode 100644 index 0000000..692b504 --- /dev/null +++ b/zluda_blas/src/impl.rs @@ -0,0 +1,32 @@ +use cuda_types::cublas::cublasStatus_t; + +#[cfg(debug_assertions)] +pub(crate) fn unimplemented() -> cublasStatus_t { + unimplemented!() +} + +#[cfg(not(debug_assertions))] +pub(crate) fn unimplemented() -> cublasStatus_t { + cublasStatus_t::CUBLAS_STATUS_NOT_SUPPORTED +} + +#[allow(non_snake_case)] +pub fn cublasGetStatusName(_status: cuda_types::cublas::cublasStatus_t) -> *const ::core::ffi::c_char { + todo!() +} + +#[allow(non_snake_case)] +pub fn cublasGetStatusString(_status: cuda_types::cublas::cublasStatus_t) -> *const ::core::ffi::c_char { + todo!() +} + +#[allow(non_snake_case)] +pub fn cublasXerbla(_srName: *const ::core::ffi::c_char, _info: ::core::ffi::c_int) -> () { + todo!() +} + + +#[allow(non_snake_case)] +pub fn cublasGetCudartVersion() -> usize { + todo!() +} diff --git a/zluda_blas/src/lib.rs b/zluda_blas/src/lib.rs new file mode 100644 index 0000000..26c3307 --- /dev/null +++ b/zluda_blas/src/lib.rs @@ -0,0 +1,37 @@ +mod r#impl; + +macro_rules! unimplemented { + ($($abi:literal fn $fn_name:ident( $($arg_id:ident : $arg_type:ty),* ) -> $ret_type:ty;)*) => { + $( + #[cfg_attr(not(test), no_mangle)] + #[allow(improper_ctypes)] + #[allow(improper_ctypes_definitions)] + pub unsafe extern $abi fn $fn_name ( $( $arg_id : $arg_type),* ) -> $ret_type { + crate::r#impl::unimplemented() + } + )* + }; +} + +macro_rules! 
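+// Editorial sketch (assumes `cublas_function_declarations!` feeds each entry
+// to the chosen macro as `"C" fn name(args) -> ret;`): routing
+// `cublasGetCudartVersion` through the `implemented` macro below expands to
+// roughly
+//
+//     #[cfg_attr(not(test), no_mangle)]
+//     pub unsafe extern "C" fn cublasGetCudartVersion() -> usize {
+//         crate::r#impl::cublasGetCudartVersion()
+//     }
+//
+// while every entry left in the `unimplemented` set becomes a stub that
+// panics under debug_assertions and returns CUBLAS_STATUS_NOT_SUPPORTED in
+// release builds.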
implemented { + ($($abi:literal fn $fn_name:ident( $($arg_id:ident : $arg_type:ty),* ) -> $ret_type:ty;)*) => { + $( + #[cfg_attr(not(test), no_mangle)] + #[allow(improper_ctypes)] + #[allow(improper_ctypes_definitions)] + pub unsafe extern $abi fn $fn_name ( $( $arg_id : $arg_type),* ) -> $ret_type { + crate::r#impl::$fn_name( $( $arg_id ),* ) + } + )* + }; +} + +cuda_base::cublas_function_declarations!( + unimplemented, + implemented <= [ + cublasGetStatusName, + cublasGetStatusString, + cublasXerbla, + cublasGetCudartVersion + ] +); diff --git a/zluda_blaslt/Cargo.toml b/zluda_blaslt/Cargo.toml new file mode 100644 index 0000000..635beb1 --- /dev/null +++ b/zluda_blaslt/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "zluda_blaslt" +version = "0.0.0" +edition = "2021" + +[lib] +crate-type = ["cdylib"] +name = "cublaslt" + +[dependencies] +cuda_base = { path = "../cuda_base" } +cuda_types = { path = "../cuda_types" } + +[package.metadata.zluda] +linux_symlinks = [ + "libcublasLt.so", + "libcublasLt.so.12", +] diff --git a/zluda_blaslt/src/impl.rs b/zluda_blaslt/src/impl.rs new file mode 100644 index 0000000..2d6d20b --- /dev/null +++ b/zluda_blaslt/src/impl.rs @@ -0,0 +1,42 @@ +use cuda_types::cublaslt::cublasStatus_t; + +#[cfg(debug_assertions)] +pub(crate) fn unimplemented() -> cublasStatus_t { + unimplemented!() +} + +#[cfg(not(debug_assertions))] +pub(crate) fn unimplemented() -> cublasStatus_t { + cublasStatus_t::CUBLAS_STATUS_NOT_SUPPORTED +} + +#[allow(non_snake_case)] +pub(crate) fn cublasLtGetStatusName( + _status: cuda_types::cublaslt::cublasStatus_t, +) -> *const ::core::ffi::c_char { + todo!() +} + +#[allow(non_snake_case)] +pub(crate) fn cublasLtGetStatusString( + _status: cuda_types::cublaslt::cublasStatus_t, +) -> *const ::core::ffi::c_char { + todo!() +} + +#[allow(non_snake_case)] +pub(crate) fn cublasLtGetVersion() -> usize { + todo!() +} + +#[allow(non_snake_case)] +pub(crate) fn cublasLtGetCudartVersion() -> usize { + todo!() +} + +#[allow(non_snake_case)] +pub(crate) fn cublasLtDisableCpuInstructionsSetMask( + _mask: ::core::ffi::c_uint, +) -> ::core::ffi::c_uint { + todo!() +} diff --git a/zluda_blaslt/src/lib.rs b/zluda_blaslt/src/lib.rs new file mode 100644 index 0000000..005f97f --- /dev/null +++ b/zluda_blaslt/src/lib.rs @@ -0,0 +1,40 @@ +mod r#impl; + +pub enum FILE { } + +macro_rules! unimplemented { + ($($abi:literal fn $fn_name:ident( $($arg_id:ident : $arg_type:ty),* ) -> $ret_type:ty;)*) => { + $( + #[cfg_attr(not(test), no_mangle)] + #[allow(improper_ctypes)] + #[allow(improper_ctypes_definitions)] + pub unsafe extern $abi fn $fn_name ( $( $arg_id : $arg_type),* ) -> $ret_type { + crate::r#impl::unimplemented() + } + )* + }; +} + +macro_rules! 
implemented { + ($($abi:literal fn $fn_name:ident( $($arg_id:ident : $arg_type:ty),* ) -> $ret_type:ty;)*) => { + $( + #[cfg_attr(not(test), no_mangle)] + #[allow(improper_ctypes)] + #[allow(improper_ctypes_definitions)] + pub unsafe extern $abi fn $fn_name ( $( $arg_id : $arg_type),* ) -> $ret_type { + crate::r#impl::$fn_name( $( $arg_id ),* ) + } + )* + }; +} + +cuda_base::cublaslt_function_declarations!( + unimplemented, + implemented <= [ + cublasLtGetStatusName, + cublasLtGetStatusString, + cublasLtDisableCpuInstructionsSetMask, + cublasLtGetVersion, + cublasLtGetCudartVersion + ] +); diff --git a/zluda_dnn/Cargo.toml b/zluda_dnn/Cargo.toml new file mode 100644 index 0000000..71245fa --- /dev/null +++ b/zluda_dnn/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "zluda_dnn" +version = "0.0.0" +edition = "2021" + +[lib] +crate-type = ["cdylib"] +name = "cudnn64_9" + +[dependencies] +cuda_base = { path = "../cuda_base" } +cuda_types = { path = "../cuda_types" } + +[package.metadata.zluda] +linux_symlinks = [ + "libcudnn.so", + "libcudnn.so.9", +] diff --git a/zluda_dnn/src/impl.rs b/zluda_dnn/src/impl.rs new file mode 100644 index 0000000..1357224 --- /dev/null +++ b/zluda_dnn/src/impl.rs @@ -0,0 +1,34 @@ +use cuda_types::cudnn9::cudnnStatus_t; + +#[cfg(debug_assertions)] +pub(crate) fn unimplemented() -> cudnnStatus_t { + unimplemented!() +} + +#[cfg(not(debug_assertions))] +pub(crate) fn unimplemented() -> cudnnStatus_t { + cudnnStatus_t::CUDNN_STATUS_NOT_SUPPORTED +} + +#[allow(non_snake_case)] +pub(crate) fn cudnnGetVersion() -> usize { + todo!() +} +#[allow(non_snake_case)] +pub(crate) fn cudnnGetMaxDeviceVersion() -> usize { + todo!() +} +#[allow(non_snake_case)] +pub(crate) fn cudnnGetCudartVersion() -> usize { + todo!() +} +#[allow(non_snake_case)] +pub(crate) fn cudnnGetErrorString( + _status: cuda_types::cudnn9::cudnnStatus_t, +) -> *const ::core::ffi::c_char { + todo!() +} +#[allow(non_snake_case)] +pub(crate) fn cudnnGetLastErrorString(_message: *mut ::core::ffi::c_char, _max_size: usize) -> () { + todo!() +} diff --git a/zluda_dnn/src/lib.rs b/zluda_dnn/src/lib.rs new file mode 100644 index 0000000..1df8af6 --- /dev/null +++ b/zluda_dnn/src/lib.rs @@ -0,0 +1,38 @@ +mod r#impl; + +macro_rules! unimplemented { + ($($abi:literal fn $fn_name:ident( $($arg_id:ident : $arg_type:ty),* ) -> $ret_type:ty;)*) => { + $( + #[cfg_attr(not(test), no_mangle)] + #[allow(improper_ctypes)] + #[allow(improper_ctypes_definitions)] + pub unsafe extern $abi fn $fn_name ( $( $arg_id : $arg_type),* ) -> $ret_type { + crate::r#impl::unimplemented() + } + )* + }; +} + +macro_rules! 
implemented { + ($($abi:literal fn $fn_name:ident( $($arg_id:ident : $arg_type:ty),* ) -> $ret_type:ty;)*) => { + $( + #[cfg_attr(not(test), no_mangle)] + #[allow(improper_ctypes)] + #[allow(improper_ctypes_definitions)] + pub unsafe extern $abi fn $fn_name ( $( $arg_id : $arg_type),* ) -> $ret_type { + crate::r#impl::$fn_name( $( $arg_id ),* ) + } + )* + }; +} + +cuda_base::cudnn9_function_declarations!( + unimplemented, + implemented <= [ + cudnnGetVersion, + cudnnGetMaxDeviceVersion, + cudnnGetCudartVersion, + cudnnGetErrorString, + cudnnGetLastErrorString + ] +); diff --git a/zluda_dump/src/format.rs b/zluda_dump/src/format.rs index 776c493..e05b65f 100644 --- a/zluda_dump/src/format.rs +++ b/zluda_dump/src/format.rs @@ -114,6 +114,17 @@ impl CudaDisplay for f32 { } } +impl CudaDisplay for f64 { + fn write( + &self, + _fn_name: &'static str, + _index: usize, + writer: &mut (impl std::io::Write + ?Sized), + ) -> std::io::Result<()> { + write!(writer, "{}", *self) + } +} + pub fn write_handle( this: &[T; 64], writer: &mut (impl std::io::Write + ?Sized), @@ -257,6 +268,69 @@ impl CudaDisplay for CUstreamBatchMemOpParams { } } +impl CudaDisplay for CUcheckpointRestoreArgs_st { + fn write( + &self, + fn_name: &'static str, + index: usize, + writer: &mut (impl std::io::Write + ?Sized), + ) -> std::io::Result<()> { + CudaDisplay::write(&self.reserved, fn_name, index, writer) + } +} + +impl CudaDisplay for CUcheckpointUnlockArgs_st { + fn write( + &self, + fn_name: &'static str, + index: usize, + writer: &mut (impl std::io::Write + ?Sized), + ) -> std::io::Result<()> { + CudaDisplay::write(&self.reserved, fn_name, index, writer) + } +} + +impl CudaDisplay for CUcheckpointCheckpointArgs_st { + fn write( + &self, + fn_name: &'static str, + index: usize, + writer: &mut (impl std::io::Write + ?Sized), + ) -> std::io::Result<()> { + CudaDisplay::write(&self.reserved, fn_name, index, writer) + } +} + +impl CudaDisplay for CUmemcpy3DOperand_st { + fn write( + &self, + fn_name: &'static str, + index: usize, + writer: &mut (impl std::io::Write + ?Sized), + ) -> std::io::Result<()> { + writer.write_all(b"{ type_: ")?; + CudaDisplay::write(&self.type_, "", 0, writer)?; + writer.write_all(b", op: ")?; + match self.type_ { + CUmemcpy3DOperandType::CU_MEMCPY_OPERAND_TYPE_ARRAY => { + CudaDisplay::write(unsafe { &self.op.array }, fn_name, index, writer)?; + } + CUmemcpy3DOperandType::CU_MEMCPY_OPERAND_TYPE_POINTER => { + CudaDisplay::write(unsafe { &self.op.ptr }, fn_name, index, writer)?; + } + _ => { + CudaDisplay::write( + &unsafe { mem::transmute::<_, [u8; 32]>(self.op) }, + fn_name, + index, + writer, + )?; + } + } + writer.write_all(b" }") + } +} + pub fn write_wait_value( this: &CUstreamBatchMemOpParams_union_CUstreamMemOpWaitValueParams_st, writer: &mut (impl std::io::Write + ?Sized), @@ -347,29 +421,29 @@ impl CudaDisplay for CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st { match self.type_ { CUexternalMemoryHandleType::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD => { writer.write_all(b", handle: ")?; - CudaDisplay::write(unsafe { &self.handle.fd }, "", 0,writer)?; + CudaDisplay::write(unsafe { &self.handle.fd }, "", 0, writer)?; } CUexternalMemoryHandleType::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32 | CUexternalMemoryHandleType::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP | CUexternalMemoryHandleType::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE - |CUexternalMemoryHandleType::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE => { + | CUexternalMemoryHandleType::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE => { 
write_win32_handle(unsafe { self.handle.win32 }, writer)?; } CUexternalMemoryHandleType::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT | CUexternalMemoryHandleType::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT => { writer.write_all(b", handle: ")?; - CudaDisplay::write(unsafe { &self.handle.win32.handle }, "", 0,writer)?; + CudaDisplay::write(unsafe { &self.handle.win32.handle }, "", 0, writer)?; } CUexternalMemoryHandleType::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF => { writer.write_all(b", handle: ")?; - CudaDisplay::write(unsafe { &self.handle.nvSciBufObject }, "", 0,writer)?; + CudaDisplay::write(unsafe { &self.handle.nvSciBufObject }, "", 0, writer)?; } _ => { writer.write_all(b", size: ")?; - CudaDisplay::write(&self.size, "", 0,writer)?; + CudaDisplay::write(&self.size, "", 0, writer)?; writer.write_all(b", flags: ")?; - CudaDisplay::write(&self.flags, "", 0,writer)?; - return writer.write_all(b", ... }") + CudaDisplay::write(&self.flags, "", 0, writer)?; + return writer.write_all(b", ... }"); } } writer.write_all(b", size: ")?; @@ -441,9 +515,7 @@ impl CudaDisplay for CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st { } } -impl CudaDisplay - for CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st__bindgen_ty_1__bindgen_ty_2 -{ +impl CudaDisplay for CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st__bindgen_ty_1__bindgen_ty_2 { fn write( &self, _fn_name: &'static str, @@ -456,9 +528,7 @@ impl CudaDisplay } } -impl CudaDisplay - for CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st__bindgen_ty_1__bindgen_ty_2 -{ +impl CudaDisplay for CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st__bindgen_ty_1__bindgen_ty_2 { fn write( &self, _fn_name: &'static str, @@ -667,15 +737,30 @@ fn write_launch_attribute( } CUlaunchAttributeID::CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE => { writer.write_all(b", value_out: ")?; - CudaDisplay::write(unsafe { &(*value_out).clusterSchedulingPolicyPreference }, fn_name, index, writer) + CudaDisplay::write( + unsafe { &(*value_out).clusterSchedulingPolicyPreference }, + fn_name, + index, + writer, + ) } CUlaunchAttributeID::CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION => { writer.write_all(b", value_out: ")?; - CudaDisplay::write(unsafe { &(*value_out).programmaticStreamSerializationAllowed }, fn_name, index, writer) + CudaDisplay::write( + unsafe { &(*value_out).programmaticStreamSerializationAllowed }, + fn_name, + index, + writer, + ) } CUlaunchAttributeID::CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT => { writer.write_all(b", value_out: ")?; - CudaDisplay::write(unsafe { &(*value_out).programmaticEvent }, fn_name, index, writer) + CudaDisplay::write( + unsafe { &(*value_out).programmaticEvent }, + fn_name, + index, + writer, + ) } CUlaunchAttributeID::CU_LAUNCH_ATTRIBUTE_PRIORITY => { writer.write_all(b", value_out: ")?; @@ -683,19 +768,39 @@ fn write_launch_attribute( } CUlaunchAttributeID::CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP => { writer.write_all(b", value_out: ")?; - CudaDisplay::write(unsafe { &(*value_out).memSyncDomainMap }, fn_name, index, writer) + CudaDisplay::write( + unsafe { &(*value_out).memSyncDomainMap }, + fn_name, + index, + writer, + ) } CUlaunchAttributeID::CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN => { writer.write_all(b", value_out: ")?; - CudaDisplay::write(unsafe { &(*value_out).memSyncDomain }, fn_name, index, writer) + CudaDisplay::write( + unsafe { &(*value_out).memSyncDomain }, + fn_name, + index, + writer, + ) } CUlaunchAttributeID::CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT => { writer.write_all(b", value_out: ")?; - CudaDisplay::write(unsafe { 
&(*value_out).launchCompletionEvent }, fn_name, index, writer) + CudaDisplay::write( + unsafe { &(*value_out).launchCompletionEvent }, + fn_name, + index, + writer, + ) } CUlaunchAttributeID::CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE => { writer.write_all(b", value_out: ")?; - CudaDisplay::write(unsafe { &(*value_out).deviceUpdatableKernelNode }, fn_name, index, writer) + CudaDisplay::write( + unsafe { &(*value_out).deviceUpdatableKernelNode }, + fn_name, + index, + writer, + ) } _ => writer.write_all(b", ... "), } diff --git a/zluda_dump/src/format_generated.rs b/zluda_dump/src/format_generated.rs index ed5eb49..cd65fee 100644 --- a/zluda_dump/src/format_generated.rs +++ b/zluda_dump/src/format_generated.rs @@ -221,6 +221,16 @@ impl crate::format::CudaDisplay for cuda_types::cuda::CUasyncCallbackHandle { write!(writer, "{:p}", *self) } } +impl crate::format::CudaDisplay for cuda_types::cuda::CUgreenCtx { + fn write( + &self, + _fn_name: &'static str, + _index: usize, + writer: &mut (impl std::io::Write + ?Sized), + ) -> std::io::Result<()> { + write!(writer, "{:p}", *self) + } +} impl crate::format::CudaDisplay for cuda_types::cuda::CUmemFabricHandle_st { fn write( &self, @@ -794,6 +804,63 @@ impl crate::format::CudaDisplay for cuda_types::cuda::CUarray_format_enum { &cuda_types::cuda::CUarray_format_enum::CU_AD_FORMAT_BC7_UNORM_SRGB => { writer.write_all(stringify!(CU_AD_FORMAT_BC7_UNORM_SRGB).as_bytes()) } + &cuda_types::cuda::CUarray_format_enum::CU_AD_FORMAT_P010 => { + writer.write_all(stringify!(CU_AD_FORMAT_P010).as_bytes()) + } + &cuda_types::cuda::CUarray_format_enum::CU_AD_FORMAT_P016 => { + writer.write_all(stringify!(CU_AD_FORMAT_P016).as_bytes()) + } + &cuda_types::cuda::CUarray_format_enum::CU_AD_FORMAT_NV16 => { + writer.write_all(stringify!(CU_AD_FORMAT_NV16).as_bytes()) + } + &cuda_types::cuda::CUarray_format_enum::CU_AD_FORMAT_P210 => { + writer.write_all(stringify!(CU_AD_FORMAT_P210).as_bytes()) + } + &cuda_types::cuda::CUarray_format_enum::CU_AD_FORMAT_P216 => { + writer.write_all(stringify!(CU_AD_FORMAT_P216).as_bytes()) + } + &cuda_types::cuda::CUarray_format_enum::CU_AD_FORMAT_YUY2 => { + writer.write_all(stringify!(CU_AD_FORMAT_YUY2).as_bytes()) + } + &cuda_types::cuda::CUarray_format_enum::CU_AD_FORMAT_Y210 => { + writer.write_all(stringify!(CU_AD_FORMAT_Y210).as_bytes()) + } + &cuda_types::cuda::CUarray_format_enum::CU_AD_FORMAT_Y216 => { + writer.write_all(stringify!(CU_AD_FORMAT_Y216).as_bytes()) + } + &cuda_types::cuda::CUarray_format_enum::CU_AD_FORMAT_AYUV => { + writer.write_all(stringify!(CU_AD_FORMAT_AYUV).as_bytes()) + } + &cuda_types::cuda::CUarray_format_enum::CU_AD_FORMAT_Y410 => { + writer.write_all(stringify!(CU_AD_FORMAT_Y410).as_bytes()) + } + &cuda_types::cuda::CUarray_format_enum::CU_AD_FORMAT_Y416 => { + writer.write_all(stringify!(CU_AD_FORMAT_Y416).as_bytes()) + } + &cuda_types::cuda::CUarray_format_enum::CU_AD_FORMAT_Y444_PLANAR8 => { + writer.write_all(stringify!(CU_AD_FORMAT_Y444_PLANAR8).as_bytes()) + } + &cuda_types::cuda::CUarray_format_enum::CU_AD_FORMAT_Y444_PLANAR10 => { + writer.write_all(stringify!(CU_AD_FORMAT_Y444_PLANAR10).as_bytes()) + } + &cuda_types::cuda::CUarray_format_enum::CU_AD_FORMAT_YUV444_8bit_SemiPlanar => { + writer + .write_all( + stringify!(CU_AD_FORMAT_YUV444_8bit_SemiPlanar).as_bytes(), + ) + } + &cuda_types::cuda::CUarray_format_enum::CU_AD_FORMAT_YUV444_16bit_SemiPlanar => { + writer + .write_all( + stringify!(CU_AD_FORMAT_YUV444_16bit_SemiPlanar).as_bytes(), + ) + } + 
&cuda_types::cuda::CUarray_format_enum::CU_AD_FORMAT_UNORM_INT_101010_2 => { + writer.write_all(stringify!(CU_AD_FORMAT_UNORM_INT_101010_2).as_bytes()) + } + &cuda_types::cuda::CUarray_format_enum::CU_AD_FORMAT_MAX => { + writer.write_all(stringify!(CU_AD_FORMAT_MAX).as_bytes()) + } _ => write!(writer, "{}", self.0), } } @@ -1768,6 +1835,45 @@ impl crate::format::CudaDisplay for cuda_types::cuda::CUdevice_attribute_enum { &cuda_types::cuda::CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_HOST_NUMA_ID => { writer.write_all(stringify!(CU_DEVICE_ATTRIBUTE_HOST_NUMA_ID).as_bytes()) } + &cuda_types::cuda::CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_D3D12_CIG_SUPPORTED => { + writer + .write_all( + stringify!(CU_DEVICE_ATTRIBUTE_D3D12_CIG_SUPPORTED).as_bytes(), + ) + } + &cuda_types::cuda::CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_MEM_DECOMPRESS_ALGORITHM_MASK => { + writer + .write_all( + stringify!(CU_DEVICE_ATTRIBUTE_MEM_DECOMPRESS_ALGORITHM_MASK) + .as_bytes(), + ) + } + &cuda_types::cuda::CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_MEM_DECOMPRESS_MAXIMUM_LENGTH => { + writer + .write_all( + stringify!(CU_DEVICE_ATTRIBUTE_MEM_DECOMPRESS_MAXIMUM_LENGTH) + .as_bytes(), + ) + } + &cuda_types::cuda::CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_GPU_PCI_DEVICE_ID => { + writer + .write_all( + stringify!(CU_DEVICE_ATTRIBUTE_GPU_PCI_DEVICE_ID).as_bytes(), + ) + } + &cuda_types::cuda::CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_GPU_PCI_SUBSYSTEM_ID => { + writer + .write_all( + stringify!(CU_DEVICE_ATTRIBUTE_GPU_PCI_SUBSYSTEM_ID).as_bytes(), + ) + } + &cuda_types::cuda::CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_HOST_NUMA_MULTINODE_IPC_SUPPORTED => { + writer + .write_all( + stringify!(CU_DEVICE_ATTRIBUTE_HOST_NUMA_MULTINODE_IPC_SUPPORTED) + .as_bytes(), + ) + } &cuda_types::cuda::CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_MAX => { writer.write_all(stringify!(CU_DEVICE_ATTRIBUTE_MAX).as_bytes()) } @@ -1907,6 +2013,13 @@ impl crate::format::CudaDisplay for cuda_types::cuda::CUpointer_attribute_enum { stringify!(CU_POINTER_ATTRIBUTE_MEMORY_BLOCK_ID).as_bytes(), ) } + &cuda_types::cuda::CUpointer_attribute_enum::CU_POINTER_ATTRIBUTE_IS_HW_DECOMPRESS_CAPABLE => { + writer + .write_all( + stringify!(CU_POINTER_ATTRIBUTE_IS_HW_DECOMPRESS_CAPABLE) + .as_bytes(), + ) + } _ => write!(writer, "{}", self.0), } } @@ -2413,9 +2526,27 @@ impl crate::format::CudaDisplay for cuda_types::cuda::CUjit_target_enum { &cuda_types::cuda::CUjit_target_enum::CU_TARGET_COMPUTE_90 => { writer.write_all(stringify!(CU_TARGET_COMPUTE_90).as_bytes()) } + &cuda_types::cuda::CUjit_target_enum::CU_TARGET_COMPUTE_100 => { + writer.write_all(stringify!(CU_TARGET_COMPUTE_100).as_bytes()) + } + &cuda_types::cuda::CUjit_target_enum::CU_TARGET_COMPUTE_101 => { + writer.write_all(stringify!(CU_TARGET_COMPUTE_101).as_bytes()) + } + &cuda_types::cuda::CUjit_target_enum::CU_TARGET_COMPUTE_120 => { + writer.write_all(stringify!(CU_TARGET_COMPUTE_120).as_bytes()) + } &cuda_types::cuda::CUjit_target_enum::CU_TARGET_COMPUTE_90A => { writer.write_all(stringify!(CU_TARGET_COMPUTE_90A).as_bytes()) } + &cuda_types::cuda::CUjit_target_enum::CU_TARGET_COMPUTE_100A => { + writer.write_all(stringify!(CU_TARGET_COMPUTE_100A).as_bytes()) + } + &cuda_types::cuda::CUjit_target_enum::CU_TARGET_COMPUTE_101A => { + writer.write_all(stringify!(CU_TARGET_COMPUTE_101A).as_bytes()) + } + &cuda_types::cuda::CUjit_target_enum::CU_TARGET_COMPUTE_120A => { + writer.write_all(stringify!(CU_TARGET_COMPUTE_120A).as_bytes()) + } _ => write!(writer, "{}", self.0), } } 
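Every enum formatter emitted above by zluda_bindgen follows the same shape: each known variant prints its identifier via stringify!, and the catch-all arm prints the raw integral value, so a dump stays readable even when the traced application passes an enum value introduced by a CUDA release newer than these bindings. A minimal hand-written sketch of that pattern follows; CUexample_enum and CU_EXAMPLE_A are illustrative stand-ins mirroring the bindgen newtype-enum layout, not types from this patch.

// Illustrative stand-in for a bindgen newtype enum; not a real cuda_types type.
#[repr(transparent)]
#[derive(PartialEq, Eq)]
pub struct CUexample_enum(pub ::core::ffi::c_uint);
impl CUexample_enum {
    pub const CU_EXAMPLE_A: CUexample_enum = CUexample_enum(0);
}
impl crate::format::CudaDisplay for CUexample_enum {
    fn write(
        &self,
        _fn_name: &'static str,
        _index: usize,
        writer: &mut (impl std::io::Write + ?Sized),
    ) -> std::io::Result<()> {
        match self {
            // Known variant: emit its identifier, not its number.
            &CUexample_enum::CU_EXAMPLE_A => {
                writer.write_all(stringify!(CU_EXAMPLE_A).as_bytes())
            }
            // Forward-compatible fallback: unknown values print as the raw integer.
            _ => write!(writer, "{}", self.0),
        }
    }
}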
@@ -2636,6 +2767,18 @@ impl crate::format::CudaDisplay for cuda_types::cuda::CUlimit_enum { writer .write_all(stringify!(CU_LIMIT_PERSISTING_L2_CACHE_SIZE).as_bytes()) } + &cuda_types::cuda::CUlimit_enum::CU_LIMIT_SHMEM_SIZE => { + writer.write_all(stringify!(CU_LIMIT_SHMEM_SIZE).as_bytes()) + } + &cuda_types::cuda::CUlimit_enum::CU_LIMIT_CIG_ENABLED => { + writer.write_all(stringify!(CU_LIMIT_CIG_ENABLED).as_bytes()) + } + &cuda_types::cuda::CUlimit_enum::CU_LIMIT_CIG_SHMEM_FALLBACK_ENABLED => { + writer + .write_all( + stringify!(CU_LIMIT_CIG_SHMEM_FALLBACK_ENABLED).as_bytes(), + ) + } &cuda_types::cuda::CUlimit_enum::CU_LIMIT_MAX => { writer.write_all(stringify!(CU_LIMIT_MAX).as_bytes()) } @@ -2913,6 +3056,9 @@ impl crate::format::CudaDisplay for cuda_types::cuda::CUgraphConditionalNodeType &cuda_types::cuda::CUgraphConditionalNodeType_enum::CU_GRAPH_COND_TYPE_WHILE => { writer.write_all(stringify!(CU_GRAPH_COND_TYPE_WHILE).as_bytes()) } + &cuda_types::cuda::CUgraphConditionalNodeType_enum::CU_GRAPH_COND_TYPE_SWITCH => { + writer.write_all(stringify!(CU_GRAPH_COND_TYPE_SWITCH).as_bytes()) + } _ => write!(writer, "{}", self.0), } } @@ -3066,6 +3212,13 @@ impl crate::format::CudaDisplay for cuda_types::cuda::CUgraphInstantiateResult_e .as_bytes(), ) } + &cuda_types::cuda::CUgraphInstantiateResult_enum::CUDA_GRAPH_INSTANTIATE_CONDITIONAL_HANDLE_UNUSED => { + writer + .write_all( + stringify!(CUDA_GRAPH_INSTANTIATE_CONDITIONAL_HANDLE_UNUSED) + .as_bytes(), + ) + } _ => write!(writer, "{}", self.0), } } @@ -3245,6 +3398,13 @@ impl crate::format::CudaDisplay for cuda_types::cuda::CUlaunchAttributeID_enum { stringify!(CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN).as_bytes(), ) } + &cuda_types::cuda::CUlaunchAttributeID_enum::CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION => { + writer + .write_all( + stringify!(CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION) + .as_bytes(), + ) + } &cuda_types::cuda::CUlaunchAttributeID_enum::CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT => { writer .write_all( @@ -3259,6 +3419,13 @@ impl crate::format::CudaDisplay for cuda_types::cuda::CUlaunchAttributeID_enum { .as_bytes(), ) } + &cuda_types::cuda::CUlaunchAttributeID_enum::CU_LAUNCH_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT => { + writer + .write_all( + stringify!(CU_LAUNCH_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT) + .as_bytes(), + ) + } &cuda_types::cuda::CUlaunchAttributeID_enum::CU_LAUNCH_ATTRIBUTE_MAX => { writer.write_all(stringify!(CU_LAUNCH_ATTRIBUTE_MAX).as_bytes()) } @@ -3318,6 +3485,23 @@ for cuda_types::cuda::CUlaunchAttributeValue_union__bindgen_ty_3 { } impl crate::format::CudaDisplay for cuda_types::cuda::CUlaunchAttributeValue_union__bindgen_ty_4 { + fn write( + &self, + _fn_name: &'static str, + _index: usize, + writer: &mut (impl std::io::Write + ?Sized), + ) -> std::io::Result<()> { + writer.write_all(concat!("{ ", stringify!(x), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&self.x, "", 0, writer)?; + writer.write_all(concat!(", ", stringify!(y), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&self.y, "", 0, writer)?; + writer.write_all(concat!(", ", stringify!(z), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&self.z, "", 0, writer)?; + writer.write_all(b" }") + } +} +impl crate::format::CudaDisplay +for cuda_types::cuda::CUlaunchAttributeValue_union__bindgen_ty_5 { fn write( &self, _fn_name: &'static str, @@ -3463,6 +3647,56 @@ impl crate::format::CudaDisplay for cuda_types::cuda::CUexecAffinitySmCount_st { writer.write_all(b" }") } } +impl 
crate::format::CudaDisplay for cuda_types::cuda::CUcigDataType_enum { + fn write( + &self, + _fn_name: &'static str, + _index: usize, + writer: &mut (impl std::io::Write + ?Sized), + ) -> std::io::Result<()> { + match self { + &cuda_types::cuda::CUcigDataType_enum::CIG_DATA_TYPE_D3D12_COMMAND_QUEUE => { + writer + .write_all(stringify!(CIG_DATA_TYPE_D3D12_COMMAND_QUEUE).as_bytes()) + } + _ => write!(writer, "{}", self.0), + } + } +} +impl crate::format::CudaDisplay for cuda_types::cuda::CUctxCigParam_st { + fn write( + &self, + _fn_name: &'static str, + _index: usize, + writer: &mut (impl std::io::Write + ?Sized), + ) -> std::io::Result<()> { + writer.write_all(concat!("{ ", stringify!(sharedDataType), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&self.sharedDataType, "", 0, writer)?; + writer.write_all(concat!(", ", stringify!(sharedData), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&self.sharedData, "", 0, writer)?; + writer.write_all(b" }") + } +} +impl crate::format::CudaDisplay for cuda_types::cuda::CUctxCreateParams_st { + fn write( + &self, + _fn_name: &'static str, + _index: usize, + writer: &mut (impl std::io::Write + ?Sized), + ) -> std::io::Result<()> { + writer + .write_all(concat!("{ ", stringify!(execAffinityParams), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&self.execAffinityParams, "", 0, writer)?; + writer + .write_all( + concat!(", ", stringify!(numExecAffinityParams), ": ").as_bytes(), + )?; + crate::format::CudaDisplay::write(&self.numExecAffinityParams, "", 0, writer)?; + writer.write_all(concat!(", ", stringify!(cigParams), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&self.cigParams, "", 0, writer)?; + writer.write_all(b" }") + } +} impl crate::format::CudaDisplay for cuda_types::cuda::CUlibraryOption_enum { fn write( &self, @@ -4160,6 +4394,24 @@ impl crate::format::CudaDisplay for cuda_types::cuda::CUtensorMapDataType_enum { stringify!(CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ).as_bytes(), ) } + &cuda_types::cuda::CUtensorMapDataType_enum::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B => { + writer + .write_all( + stringify!(CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B).as_bytes(), + ) + } + &cuda_types::cuda::CUtensorMapDataType_enum::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B => { + writer + .write_all( + stringify!(CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B).as_bytes(), + ) + } + &cuda_types::cuda::CUtensorMapDataType_enum::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B => { + writer + .write_all( + stringify!(CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B).as_bytes(), + ) + } _ => write!(writer, "{}", self.0), } } @@ -4205,6 +4457,25 @@ impl crate::format::CudaDisplay for cuda_types::cuda::CUtensorMapSwizzle_enum { &cuda_types::cuda::CUtensorMapSwizzle_enum::CU_TENSOR_MAP_SWIZZLE_128B => { writer.write_all(stringify!(CU_TENSOR_MAP_SWIZZLE_128B).as_bytes()) } + &cuda_types::cuda::CUtensorMapSwizzle_enum::CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B => { + writer + .write_all( + stringify!(CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B).as_bytes(), + ) + } + &cuda_types::cuda::CUtensorMapSwizzle_enum::CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B_FLIP_8B => { + writer + .write_all( + stringify!(CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B_FLIP_8B) + .as_bytes(), + ) + } + &cuda_types::cuda::CUtensorMapSwizzle_enum::CU_TENSOR_MAP_SWIZZLE_128B_ATOM_64B => { + writer + .write_all( + stringify!(CU_TENSOR_MAP_SWIZZLE_128B_ATOM_64B).as_bytes(), + ) + } _ => write!(writer, "{}", self.0), } } @@ -4259,6 +4530,27 @@ impl crate::format::CudaDisplay for cuda_types::cuda::CUtensorMapFloatOOBfill_en } } } +impl 
crate::format::CudaDisplay for cuda_types::cuda::CUtensorMapIm2ColWideMode_enum { + fn write( + &self, + _fn_name: &'static str, + _index: usize, + writer: &mut (impl std::io::Write + ?Sized), + ) -> std::io::Result<()> { + match self { + &cuda_types::cuda::CUtensorMapIm2ColWideMode_enum::CU_TENSOR_MAP_IM2COL_WIDE_MODE_W => { + writer.write_all(stringify!(CU_TENSOR_MAP_IM2COL_WIDE_MODE_W).as_bytes()) + } + &cuda_types::cuda::CUtensorMapIm2ColWideMode_enum::CU_TENSOR_MAP_IM2COL_WIDE_MODE_W128 => { + writer + .write_all( + stringify!(CU_TENSOR_MAP_IM2COL_WIDE_MODE_W128).as_bytes(), + ) + } + _ => write!(writer, "{}", self.0), + } + } +} impl crate::format::CudaDisplay for cuda_types::cuda::CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_st { fn write( @@ -4893,6 +5185,25 @@ impl crate::format::CudaDisplay for cuda_types::cuda::CUmemRangeHandleType_enum } } } +impl crate::format::CudaDisplay for cuda_types::cuda::CUmemRangeFlags_enum { + fn write( + &self, + _fn_name: &'static str, + _index: usize, + writer: &mut (impl std::io::Write + ?Sized), + ) -> std::io::Result<()> { + match self { + &cuda_types::cuda::CUmemRangeFlags_enum::CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE => { + writer + .write_all( + stringify!(CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE) + .as_bytes(), + ) + } + _ => write!(writer, "{}", self.0), + } + } +} impl crate::format::CudaDisplay for cuda_types::cuda::CUarraySparseSubresourceType_enum { fn write( &self, @@ -5280,6 +5591,8 @@ impl crate::format::CudaDisplay for cuda_types::cuda::CUmemPoolProps_st { crate::format::CudaDisplay::write(&self.win32SecurityAttributes, "", 0, writer)?; writer.write_all(concat!(", ", stringify!(maxSize), ": ").as_bytes())?; crate::format::CudaDisplay::write(&self.maxSize, "", 0, writer)?; + writer.write_all(concat!(", ", stringify!(usage), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&self.usage, "", 0, writer)?; writer.write_all(b" }") } } @@ -5704,6 +6017,220 @@ impl crate::format::CudaDisplay for cuda_types::cuda::CUdeviceNumaConfig_enum { } } } +impl crate::format::CudaDisplay for cuda_types::cuda::CUprocessState_enum { + fn write( + &self, + _fn_name: &'static str, + _index: usize, + writer: &mut (impl std::io::Write + ?Sized), + ) -> std::io::Result<()> { + match self { + &cuda_types::cuda::CUprocessState_enum::CU_PROCESS_STATE_RUNNING => { + writer.write_all(stringify!(CU_PROCESS_STATE_RUNNING).as_bytes()) + } + &cuda_types::cuda::CUprocessState_enum::CU_PROCESS_STATE_LOCKED => { + writer.write_all(stringify!(CU_PROCESS_STATE_LOCKED).as_bytes()) + } + &cuda_types::cuda::CUprocessState_enum::CU_PROCESS_STATE_CHECKPOINTED => { + writer.write_all(stringify!(CU_PROCESS_STATE_CHECKPOINTED).as_bytes()) + } + &cuda_types::cuda::CUprocessState_enum::CU_PROCESS_STATE_FAILED => { + writer.write_all(stringify!(CU_PROCESS_STATE_FAILED).as_bytes()) + } + _ => write!(writer, "{}", self.0), + } + } +} +impl crate::format::CudaDisplay for cuda_types::cuda::CUcheckpointLockArgs_st { + fn write( + &self, + _fn_name: &'static str, + _index: usize, + writer: &mut (impl std::io::Write + ?Sized), + ) -> std::io::Result<()> { + writer.write_all(concat!("{ ", stringify!(timeoutMs), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&self.timeoutMs, "", 0, writer)?; + writer.write_all(b" }") + } +} +impl crate::format::CudaDisplay for cuda_types::cuda::CUmemcpyFlags_enum { + fn write( + &self, + _fn_name: &'static str, + _index: usize, + writer: &mut (impl std::io::Write + ?Sized), + ) -> std::io::Result<()> { + match self { + 
&cuda_types::cuda::CUmemcpyFlags_enum::CU_MEMCPY_FLAG_DEFAULT => { + writer.write_all(stringify!(CU_MEMCPY_FLAG_DEFAULT).as_bytes()) + } + &cuda_types::cuda::CUmemcpyFlags_enum::CU_MEMCPY_FLAG_PREFER_OVERLAP_WITH_COMPUTE => { + writer + .write_all( + stringify!(CU_MEMCPY_FLAG_PREFER_OVERLAP_WITH_COMPUTE).as_bytes(), + ) + } + _ => write!(writer, "{}", self.0), + } + } +} +impl crate::format::CudaDisplay for cuda_types::cuda::CUmemcpySrcAccessOrder_enum { + fn write( + &self, + _fn_name: &'static str, + _index: usize, + writer: &mut (impl std::io::Write + ?Sized), + ) -> std::io::Result<()> { + match self { + &cuda_types::cuda::CUmemcpySrcAccessOrder_enum::CU_MEMCPY_SRC_ACCESS_ORDER_INVALID => { + writer + .write_all(stringify!(CU_MEMCPY_SRC_ACCESS_ORDER_INVALID).as_bytes()) + } + &cuda_types::cuda::CUmemcpySrcAccessOrder_enum::CU_MEMCPY_SRC_ACCESS_ORDER_STREAM => { + writer + .write_all(stringify!(CU_MEMCPY_SRC_ACCESS_ORDER_STREAM).as_bytes()) + } + &cuda_types::cuda::CUmemcpySrcAccessOrder_enum::CU_MEMCPY_SRC_ACCESS_ORDER_DURING_API_CALL => { + writer + .write_all( + stringify!(CU_MEMCPY_SRC_ACCESS_ORDER_DURING_API_CALL).as_bytes(), + ) + } + &cuda_types::cuda::CUmemcpySrcAccessOrder_enum::CU_MEMCPY_SRC_ACCESS_ORDER_ANY => { + writer.write_all(stringify!(CU_MEMCPY_SRC_ACCESS_ORDER_ANY).as_bytes()) + } + &cuda_types::cuda::CUmemcpySrcAccessOrder_enum::CU_MEMCPY_SRC_ACCESS_ORDER_MAX => { + writer.write_all(stringify!(CU_MEMCPY_SRC_ACCESS_ORDER_MAX).as_bytes()) + } + _ => write!(writer, "{}", self.0), + } + } +} +impl crate::format::CudaDisplay for cuda_types::cuda::CUmemcpyAttributes_st { + fn write( + &self, + _fn_name: &'static str, + _index: usize, + writer: &mut (impl std::io::Write + ?Sized), + ) -> std::io::Result<()> { + writer.write_all(concat!("{ ", stringify!(srcAccessOrder), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&self.srcAccessOrder, "", 0, writer)?; + writer.write_all(concat!(", ", stringify!(srcLocHint), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&self.srcLocHint, "", 0, writer)?; + writer.write_all(concat!(", ", stringify!(dstLocHint), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&self.dstLocHint, "", 0, writer)?; + writer.write_all(concat!(", ", stringify!(flags), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&self.flags, "", 0, writer)?; + writer.write_all(b" }") + } +} +impl crate::format::CudaDisplay for cuda_types::cuda::CUmemcpy3DOperandType_enum { + fn write( + &self, + _fn_name: &'static str, + _index: usize, + writer: &mut (impl std::io::Write + ?Sized), + ) -> std::io::Result<()> { + match self { + &cuda_types::cuda::CUmemcpy3DOperandType_enum::CU_MEMCPY_OPERAND_TYPE_POINTER => { + writer.write_all(stringify!(CU_MEMCPY_OPERAND_TYPE_POINTER).as_bytes()) + } + &cuda_types::cuda::CUmemcpy3DOperandType_enum::CU_MEMCPY_OPERAND_TYPE_ARRAY => { + writer.write_all(stringify!(CU_MEMCPY_OPERAND_TYPE_ARRAY).as_bytes()) + } + &cuda_types::cuda::CUmemcpy3DOperandType_enum::CU_MEMCPY_OPERAND_TYPE_MAX => { + writer.write_all(stringify!(CU_MEMCPY_OPERAND_TYPE_MAX).as_bytes()) + } + _ => write!(writer, "{}", self.0), + } + } +} +impl crate::format::CudaDisplay for cuda_types::cuda::CUoffset3D_st { + fn write( + &self, + _fn_name: &'static str, + _index: usize, + writer: &mut (impl std::io::Write + ?Sized), + ) -> std::io::Result<()> { + writer.write_all(concat!("{ ", stringify!(x), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&self.x, "", 0, writer)?; + writer.write_all(concat!(", ", stringify!(y), ": ").as_bytes())?; + 
crate::format::CudaDisplay::write(&self.y, "", 0, writer)?; + writer.write_all(concat!(", ", stringify!(z), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&self.z, "", 0, writer)?; + writer.write_all(b" }") + } +} +impl crate::format::CudaDisplay for cuda_types::cuda::CUextent3D_st { + fn write( + &self, + _fn_name: &'static str, + _index: usize, + writer: &mut (impl std::io::Write + ?Sized), + ) -> std::io::Result<()> { + writer.write_all(concat!("{ ", stringify!(width), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&self.width, "", 0, writer)?; + writer.write_all(concat!(", ", stringify!(height), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&self.height, "", 0, writer)?; + writer.write_all(concat!(", ", stringify!(depth), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&self.depth, "", 0, writer)?; + writer.write_all(b" }") + } +} +impl crate::format::CudaDisplay +for cuda_types::cuda::CUmemcpy3DOperand_st__bindgen_ty_1__bindgen_ty_1 { + fn write( + &self, + _fn_name: &'static str, + _index: usize, + writer: &mut (impl std::io::Write + ?Sized), + ) -> std::io::Result<()> { + writer.write_all(concat!("{ ", stringify!(ptr), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&self.ptr, "", 0, writer)?; + writer.write_all(concat!(", ", stringify!(rowLength), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&self.rowLength, "", 0, writer)?; + writer.write_all(concat!(", ", stringify!(layerHeight), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&self.layerHeight, "", 0, writer)?; + writer.write_all(concat!(", ", stringify!(locHint), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&self.locHint, "", 0, writer)?; + writer.write_all(b" }") + } +} +impl crate::format::CudaDisplay +for cuda_types::cuda::CUmemcpy3DOperand_st__bindgen_ty_1__bindgen_ty_2 { + fn write( + &self, + _fn_name: &'static str, + _index: usize, + writer: &mut (impl std::io::Write + ?Sized), + ) -> std::io::Result<()> { + writer.write_all(concat!("{ ", stringify!(array), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&self.array, "", 0, writer)?; + writer.write_all(concat!(", ", stringify!(offset), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&self.offset, "", 0, writer)?; + writer.write_all(b" }") + } +} +impl crate::format::CudaDisplay for cuda_types::cuda::CUDA_MEMCPY3D_BATCH_OP_st { + fn write( + &self, + _fn_name: &'static str, + _index: usize, + writer: &mut (impl std::io::Write + ?Sized), + ) -> std::io::Result<()> { + writer.write_all(concat!("{ ", stringify!(src), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&self.src, "", 0, writer)?; + writer.write_all(concat!(", ", stringify!(dst), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&self.dst, "", 0, writer)?; + writer.write_all(concat!(", ", stringify!(extent), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&self.extent, "", 0, writer)?; + writer.write_all(concat!(", ", stringify!(srcAccessOrder), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&self.srcAccessOrder, "", 0, writer)?; + writer.write_all(concat!(", ", stringify!(flags), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&self.flags, "", 0, writer)?; + writer.write_all(b" }") + } +} pub fn write_cuGetErrorString( writer: &mut (impl std::io::Write + ?Sized), error: cuda_types::cuda::CUresult, @@ -6316,6 +6843,36 @@ pub fn write_cuCtxCreate_v3( crate::format::CudaDisplay::write(&dev, "cuCtxCreate_v3", arg_idx, writer)?; writer.write_all(b")") } +pub fn write_cuCtxCreate_v4( + 
writer: &mut (impl std::io::Write + ?Sized), + pctx: *mut cuda_types::cuda::CUcontext, + ctxCreateParams: *mut cuda_types::cuda::CUctxCreateParams, + flags: ::core::ffi::c_uint, + dev: cuda_types::cuda::CUdevice, +) -> std::io::Result<()> { + let mut arg_idx = 0usize; + writer.write_all(b"(")?; + writer.write_all(concat!(stringify!(pctx), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&pctx, "cuCtxCreate_v4", arg_idx, writer)?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(ctxCreateParams), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &ctxCreateParams, + "cuCtxCreate_v4", + arg_idx, + writer, + )?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(flags), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&flags, "cuCtxCreate_v4", arg_idx, writer)?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(dev), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&dev, "cuCtxCreate_v4", arg_idx, writer)?; + writer.write_all(b")") +} pub fn write_cuCtxDestroy_v2( writer: &mut (impl std::io::Write + ?Sized), ctx: cuda_types::cuda::CUcontext, @@ -6531,6 +7088,36 @@ pub fn write_cuCtxGetExecAffinity( crate::format::CudaDisplay::write(&type_, "cuCtxGetExecAffinity", arg_idx, writer)?; writer.write_all(b")") } +pub fn write_cuCtxRecordEvent( + writer: &mut (impl std::io::Write + ?Sized), + hCtx: cuda_types::cuda::CUcontext, + hEvent: cuda_types::cuda::CUevent, +) -> std::io::Result<()> { + let mut arg_idx = 0usize; + writer.write_all(b"(")?; + writer.write_all(concat!(stringify!(hCtx), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&hCtx, "cuCtxRecordEvent", arg_idx, writer)?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(hEvent), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&hEvent, "cuCtxRecordEvent", arg_idx, writer)?; + writer.write_all(b")") +} +pub fn write_cuCtxWaitEvent( + writer: &mut (impl std::io::Write + ?Sized), + hCtx: cuda_types::cuda::CUcontext, + hEvent: cuda_types::cuda::CUevent, +) -> std::io::Result<()> { + let mut arg_idx = 0usize; + writer.write_all(b"(")?; + writer.write_all(concat!(stringify!(hCtx), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&hCtx, "cuCtxWaitEvent", arg_idx, writer)?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(hEvent), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&hEvent, "cuCtxWaitEvent", arg_idx, writer)?; + writer.write_all(b")") +} pub fn write_cuCtxAttach( writer: &mut (impl std::io::Write + ?Sized), pctx: *mut cuda_types::cuda::CUcontext, @@ -7289,6 +7876,21 @@ pub fn write_cuKernelGetFunction( crate::format::CudaDisplay::write(&kernel, "cuKernelGetFunction", arg_idx, writer)?; writer.write_all(b")") } +pub fn write_cuKernelGetLibrary( + writer: &mut (impl std::io::Write + ?Sized), + pLib: *mut cuda_types::cuda::CUlibrary, + kernel: cuda_types::cuda::CUkernel, +) -> std::io::Result<()> { + let mut arg_idx = 0usize; + writer.write_all(b"(")?; + writer.write_all(concat!(stringify!(pLib), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&pLib, "cuKernelGetLibrary", arg_idx, writer)?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(kernel), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&kernel, "cuKernelGetLibrary", arg_idx, writer)?; + writer.write_all(b")") +} pub fn write_cuLibraryGetGlobal( writer: &mut (impl std::io::Write + ?Sized), dptr: *mut 
cuda_types::cuda::CUdeviceptr, @@ -8834,6 +9436,156 @@ pub fn write_cuMemcpy3DPeerAsync_ptsz( )?; writer.write_all(b")") } +pub fn write_cuMemcpyBatchAsync_ptsz( + writer: &mut (impl std::io::Write + ?Sized), + dsts: *mut cuda_types::cuda::CUdeviceptr, + srcs: *mut cuda_types::cuda::CUdeviceptr, + sizes: *mut usize, + count: usize, + attrs: *mut cuda_types::cuda::CUmemcpyAttributes, + attrsIdxs: *mut usize, + numAttrs: usize, + failIdx: *mut usize, + hStream: cuda_types::cuda::CUstream, +) -> std::io::Result<()> { + let mut arg_idx = 0usize; + writer.write_all(b"(")?; + writer.write_all(concat!(stringify!(dsts), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &dsts, + "cuMemcpyBatchAsync_ptsz", + arg_idx, + writer, + )?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(srcs), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &srcs, + "cuMemcpyBatchAsync_ptsz", + arg_idx, + writer, + )?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(sizes), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &sizes, + "cuMemcpyBatchAsync_ptsz", + arg_idx, + writer, + )?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(count), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &count, + "cuMemcpyBatchAsync_ptsz", + arg_idx, + writer, + )?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(attrs), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &attrs, + "cuMemcpyBatchAsync_ptsz", + arg_idx, + writer, + )?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(attrsIdxs), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &attrsIdxs, + "cuMemcpyBatchAsync_ptsz", + arg_idx, + writer, + )?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(numAttrs), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &numAttrs, + "cuMemcpyBatchAsync_ptsz", + arg_idx, + writer, + )?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(failIdx), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &failIdx, + "cuMemcpyBatchAsync_ptsz", + arg_idx, + writer, + )?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(hStream), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &hStream, + "cuMemcpyBatchAsync_ptsz", + arg_idx, + writer, + )?; + writer.write_all(b")") +} +pub fn write_cuMemcpy3DBatchAsync_ptsz( + writer: &mut (impl std::io::Write + ?Sized), + numOps: usize, + opList: *mut cuda_types::cuda::CUDA_MEMCPY3D_BATCH_OP, + failIdx: *mut usize, + flags: ::core::ffi::c_ulonglong, + hStream: cuda_types::cuda::CUstream, +) -> std::io::Result<()> { + let mut arg_idx = 0usize; + writer.write_all(b"(")?; + writer.write_all(concat!(stringify!(numOps), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &numOps, + "cuMemcpy3DBatchAsync_ptsz", + arg_idx, + writer, + )?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(opList), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &opList, + "cuMemcpy3DBatchAsync_ptsz", + arg_idx, + writer, + )?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(failIdx), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &failIdx, + "cuMemcpy3DBatchAsync_ptsz", + arg_idx, + writer, + )?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(flags), ": ").as_bytes())?; 
+ crate::format::CudaDisplay::write( + &flags, + "cuMemcpy3DBatchAsync_ptsz", + arg_idx, + writer, + )?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(hStream), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &hStream, + "cuMemcpy3DBatchAsync_ptsz", + arg_idx, + writer, + )?; + writer.write_all(b")") +} pub fn write_cuMemsetD8_v2_ptds( writer: &mut (impl std::io::Write + ?Sized), dstDevice: cuda_types::cuda::CUdeviceptr, @@ -9704,6 +10456,110 @@ pub fn write_cuMemGetHandleForAddressRange( )?; writer.write_all(b")") } +impl crate::format::CudaDisplay for cuda_types::cuda::CUmemDecompressAlgorithm_enum { + fn write( + &self, + _fn_name: &'static str, + _index: usize, + writer: &mut (impl std::io::Write + ?Sized), + ) -> std::io::Result<()> { + match self { + &cuda_types::cuda::CUmemDecompressAlgorithm_enum::CU_MEM_DECOMPRESS_UNSUPPORTED => { + writer.write_all(stringify!(CU_MEM_DECOMPRESS_UNSUPPORTED).as_bytes()) + } + &cuda_types::cuda::CUmemDecompressAlgorithm_enum::CU_MEM_DECOMPRESS_ALGORITHM_DEFLATE => { + writer + .write_all( + stringify!(CU_MEM_DECOMPRESS_ALGORITHM_DEFLATE).as_bytes(), + ) + } + &cuda_types::cuda::CUmemDecompressAlgorithm_enum::CU_MEM_DECOMPRESS_ALGORITHM_SNAPPY => { + writer + .write_all(stringify!(CU_MEM_DECOMPRESS_ALGORITHM_SNAPPY).as_bytes()) + } + _ => write!(writer, "{}", self.0), + } + } +} +impl crate::format::CudaDisplay for cuda_types::cuda::CUmemDecompressParams_st { + fn write( + &self, + _fn_name: &'static str, + _index: usize, + writer: &mut (impl std::io::Write + ?Sized), + ) -> std::io::Result<()> { + writer.write_all(concat!("{ ", stringify!(srcNumBytes), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&self.srcNumBytes, "", 0, writer)?; + writer.write_all(concat!(", ", stringify!(dstNumBytes), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&self.dstNumBytes, "", 0, writer)?; + writer.write_all(concat!(", ", stringify!(dstActBytes), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&self.dstActBytes, "", 0, writer)?; + writer.write_all(concat!(", ", stringify!(src), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&self.src, "", 0, writer)?; + writer.write_all(concat!(", ", stringify!(dst), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&self.dst, "", 0, writer)?; + writer.write_all(concat!(", ", stringify!(algo), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&self.algo, "", 0, writer)?; + writer.write_all(concat!(", ", stringify!(padding), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&self.padding, "", 0, writer)?; + writer.write_all(b" }") + } +} +pub fn write_cuMemBatchDecompressAsync_ptsz( + writer: &mut (impl std::io::Write + ?Sized), + paramsArray: *mut cuda_types::cuda::CUmemDecompressParams, + count: usize, + flags: ::core::ffi::c_uint, + errorIndex: *mut usize, + stream: cuda_types::cuda::CUstream, +) -> std::io::Result<()> { + let mut arg_idx = 0usize; + writer.write_all(b"(")?; + writer.write_all(concat!(stringify!(paramsArray), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + ¶msArray, + "cuMemBatchDecompressAsync_ptsz", + arg_idx, + writer, + )?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(count), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &count, + "cuMemBatchDecompressAsync_ptsz", + arg_idx, + writer, + )?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(flags), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &flags, + 
"cuMemBatchDecompressAsync_ptsz", + arg_idx, + writer, + )?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(errorIndex), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &errorIndex, + "cuMemBatchDecompressAsync_ptsz", + arg_idx, + writer, + )?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(stream), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &stream, + "cuMemBatchDecompressAsync_ptsz", + arg_idx, + writer, + )?; + writer.write_all(b")") +} pub fn write_cuMemAddressReserve( writer: &mut (impl std::io::Write + ?Sized), ptr: *mut cuda_types::cuda::CUdeviceptr, @@ -11046,6 +11902,31 @@ pub fn write_cuStreamGetPriority_ptsz( )?; writer.write_all(b")") } +pub fn write_cuStreamGetDevice_ptsz( + writer: &mut (impl std::io::Write + ?Sized), + hStream: cuda_types::cuda::CUstream, + device: *mut cuda_types::cuda::CUdevice, +) -> std::io::Result<()> { + let mut arg_idx = 0usize; + writer.write_all(b"(")?; + writer.write_all(concat!(stringify!(hStream), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &hStream, + "cuStreamGetDevice_ptsz", + arg_idx, + writer, + )?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(device), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &device, + "cuStreamGetDevice_ptsz", + arg_idx, + writer, + )?; + writer.write_all(b")") +} pub fn write_cuStreamGetFlags_ptsz( writer: &mut (impl std::io::Write + ?Sized), hStream: cuda_types::cuda::CUstream, @@ -11096,6 +11977,36 @@ pub fn write_cuStreamGetCtx_ptsz( crate::format::CudaDisplay::write(&pctx, "cuStreamGetCtx_ptsz", arg_idx, writer)?; writer.write_all(b")") } +pub fn write_cuStreamGetCtx_v2_ptsz( + writer: &mut (impl std::io::Write + ?Sized), + hStream: cuda_types::cuda::CUstream, + pCtx: *mut cuda_types::cuda::CUcontext, + pGreenCtx: *mut cuda_types::cuda::CUgreenCtx, +) -> std::io::Result<()> { + let mut arg_idx = 0usize; + writer.write_all(b"(")?; + writer.write_all(concat!(stringify!(hStream), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &hStream, + "cuStreamGetCtx_v2_ptsz", + arg_idx, + writer, + )?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(pCtx), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&pCtx, "cuStreamGetCtx_v2_ptsz", arg_idx, writer)?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(pGreenCtx), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &pGreenCtx, + "cuStreamGetCtx_v2_ptsz", + arg_idx, + writer, + )?; + writer.write_all(b")") +} pub fn write_cuStreamWaitEvent_ptsz( writer: &mut (impl std::io::Write + ?Sized), hStream: cuda_types::cuda::CUstream, @@ -11796,6 +12707,36 @@ pub fn write_cuEventElapsedTime( crate::format::CudaDisplay::write(&hEnd, "cuEventElapsedTime", arg_idx, writer)?; writer.write_all(b")") } +pub fn write_cuEventElapsedTime_v2( + writer: &mut (impl std::io::Write + ?Sized), + pMilliseconds: *mut f32, + hStart: cuda_types::cuda::CUevent, + hEnd: cuda_types::cuda::CUevent, +) -> std::io::Result<()> { + let mut arg_idx = 0usize; + writer.write_all(b"(")?; + writer.write_all(concat!(stringify!(pMilliseconds), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &pMilliseconds, + "cuEventElapsedTime_v2", + arg_idx, + writer, + )?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(hStart), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &hStart, + "cuEventElapsedTime_v2", + arg_idx, + 
writer, + )?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(hEnd), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&hEnd, "cuEventElapsedTime_v2", arg_idx, writer)?; + writer.write_all(b")") +} pub fn write_cuImportExternalMemory( writer: &mut (impl std::io::Write + ?Sized), extMem_out: *mut cuda_types::cuda::CUexternalMemory, @@ -17333,6 +18274,171 @@ pub fn write_cuTensorMapEncodeIm2col( )?; writer.write_all(b")") } +pub fn write_cuTensorMapEncodeIm2colWide( + writer: &mut (impl std::io::Write + ?Sized), + tensorMap: *mut cuda_types::cuda::CUtensorMap, + tensorDataType: cuda_types::cuda::CUtensorMapDataType, + tensorRank: cuda_types::cuda::cuuint32_t, + globalAddress: *mut ::core::ffi::c_void, + globalDim: *const cuda_types::cuda::cuuint64_t, + globalStrides: *const cuda_types::cuda::cuuint64_t, + pixelBoxLowerCornerWidth: ::core::ffi::c_int, + pixelBoxUpperCornerWidth: ::core::ffi::c_int, + channelsPerPixel: cuda_types::cuda::cuuint32_t, + pixelsPerColumn: cuda_types::cuda::cuuint32_t, + elementStrides: *const cuda_types::cuda::cuuint32_t, + interleave: cuda_types::cuda::CUtensorMapInterleave, + mode: cuda_types::cuda::CUtensorMapIm2ColWideMode, + swizzle: cuda_types::cuda::CUtensorMapSwizzle, + l2Promotion: cuda_types::cuda::CUtensorMapL2promotion, + oobFill: cuda_types::cuda::CUtensorMapFloatOOBfill, +) -> std::io::Result<()> { + let mut arg_idx = 0usize; + writer.write_all(b"(")?; + writer.write_all(concat!(stringify!(tensorMap), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &tensorMap, + "cuTensorMapEncodeIm2colWide", + arg_idx, + writer, + )?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(tensorDataType), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &tensorDataType, + "cuTensorMapEncodeIm2colWide", + arg_idx, + writer, + )?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(tensorRank), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &tensorRank, + "cuTensorMapEncodeIm2colWide", + arg_idx, + writer, + )?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(globalAddress), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &globalAddress, + "cuTensorMapEncodeIm2colWide", + arg_idx, + writer, + )?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(globalDim), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &globalDim, + "cuTensorMapEncodeIm2colWide", + arg_idx, + writer, + )?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(globalStrides), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &globalStrides, + "cuTensorMapEncodeIm2colWide", + arg_idx, + writer, + )?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(pixelBoxLowerCornerWidth), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &pixelBoxLowerCornerWidth, + "cuTensorMapEncodeIm2colWide", + arg_idx, + writer, + )?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(pixelBoxUpperCornerWidth), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &pixelBoxUpperCornerWidth, + "cuTensorMapEncodeIm2colWide", + arg_idx, + writer, + )?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(channelsPerPixel), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &channelsPerPixel, + "cuTensorMapEncodeIm2colWide", + arg_idx, + writer, + )?; + arg_idx += 1; + 
writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(pixelsPerColumn), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &pixelsPerColumn, + "cuTensorMapEncodeIm2colWide", + arg_idx, + writer, + )?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(elementStrides), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &elementStrides, + "cuTensorMapEncodeIm2colWide", + arg_idx, + writer, + )?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(interleave), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &interleave, + "cuTensorMapEncodeIm2colWide", + arg_idx, + writer, + )?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(mode), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &mode, + "cuTensorMapEncodeIm2colWide", + arg_idx, + writer, + )?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(swizzle), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &swizzle, + "cuTensorMapEncodeIm2colWide", + arg_idx, + writer, + )?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(l2Promotion), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &l2Promotion, + "cuTensorMapEncodeIm2colWide", + arg_idx, + writer, + )?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(oobFill), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &oobFill, + "cuTensorMapEncodeIm2colWide", + arg_idx, + writer, + )?; + writer.write_all(b")") +} pub fn write_cuTensorMapReplaceAddress( writer: &mut (impl std::io::Write + ?Sized), tensorMap: *mut cuda_types::cuda::CUtensorMap, @@ -17749,6 +18855,9 @@ impl crate::format::CudaDisplay for cuda_types::cuda::CUcoredumpSettings_enum { &cuda_types::cuda::CUcoredumpSettings_enum::CU_COREDUMP_PIPE => { writer.write_all(stringify!(CU_COREDUMP_PIPE).as_bytes()) } + &cuda_types::cuda::CUcoredumpSettings_enum::CU_COREDUMP_GENERATION_FLAGS => { + writer.write_all(stringify!(CU_COREDUMP_GENERATION_FLAGS).as_bytes()) + } &cuda_types::cuda::CUcoredumpSettings_enum::CU_COREDUMP_MAX => { writer.write_all(stringify!(CU_COREDUMP_MAX).as_bytes()) } @@ -17756,6 +18865,46 @@ impl crate::format::CudaDisplay for cuda_types::cuda::CUcoredumpSettings_enum { } } } +impl crate::format::CudaDisplay for cuda_types::cuda::CUCoredumpGenerationFlags { + fn write( + &self, + _fn_name: &'static str, + _index: usize, + writer: &mut (impl std::io::Write + ?Sized), + ) -> std::io::Result<()> { + match self { + &cuda_types::cuda::CUCoredumpGenerationFlags::CU_COREDUMP_DEFAULT_FLAGS => { + writer.write_all(stringify!(CU_COREDUMP_DEFAULT_FLAGS).as_bytes()) + } + &cuda_types::cuda::CUCoredumpGenerationFlags::CU_COREDUMP_SKIP_NONRELOCATED_ELF_IMAGES => { + writer + .write_all( + stringify!(CU_COREDUMP_SKIP_NONRELOCATED_ELF_IMAGES).as_bytes(), + ) + } + &cuda_types::cuda::CUCoredumpGenerationFlags::CU_COREDUMP_SKIP_GLOBAL_MEMORY => { + writer.write_all(stringify!(CU_COREDUMP_SKIP_GLOBAL_MEMORY).as_bytes()) + } + &cuda_types::cuda::CUCoredumpGenerationFlags::CU_COREDUMP_SKIP_SHARED_MEMORY => { + writer.write_all(stringify!(CU_COREDUMP_SKIP_SHARED_MEMORY).as_bytes()) + } + &cuda_types::cuda::CUCoredumpGenerationFlags::CU_COREDUMP_SKIP_LOCAL_MEMORY => { + writer.write_all(stringify!(CU_COREDUMP_SKIP_LOCAL_MEMORY).as_bytes()) + } + &cuda_types::cuda::CUCoredumpGenerationFlags::CU_COREDUMP_SKIP_ABORT => { + 
writer.write_all(stringify!(CU_COREDUMP_SKIP_ABORT).as_bytes()) + } + &cuda_types::cuda::CUCoredumpGenerationFlags::CU_COREDUMP_SKIP_CONSTBANK_MEMORY => { + writer + .write_all(stringify!(CU_COREDUMP_SKIP_CONSTBANK_MEMORY).as_bytes()) + } + &cuda_types::cuda::CUCoredumpGenerationFlags::CU_COREDUMP_LIGHTWEIGHT_FLAGS => { + writer.write_all(stringify!(CU_COREDUMP_LIGHTWEIGHT_FLAGS).as_bytes()) + } + _ => write!(writer, "{}", self.0), + } + } +} pub fn write_cuCoredumpGetAttribute( writer: &mut (impl std::io::Write + ?Sized), attrib: cuda_types::cuda::CUcoredumpSettings, @@ -17911,16 +19060,6 @@ pub fn write_cuGetExportTable( )?; writer.write_all(b")") } -impl crate::format::CudaDisplay for cuda_types::cuda::CUgreenCtx { - fn write( - &self, - _fn_name: &'static str, - _index: usize, - writer: &mut (impl std::io::Write + ?Sized), - ) -> std::io::Result<()> { - write!(writer, "{:p}", *self) - } -} impl crate::format::CudaDisplay for cuda_types::cuda::CUdevResourceDesc { fn write( &self, @@ -17946,6 +19085,32 @@ impl crate::format::CudaDisplay for cuda_types::cuda::CUgreenCtxCreate_flags { } } } +impl crate::format::CudaDisplay for cuda_types::cuda::CUdevSmResourceSplit_flags { + fn write( + &self, + _fn_name: &'static str, + _index: usize, + writer: &mut (impl std::io::Write + ?Sized), + ) -> std::io::Result<()> { + match self { + &cuda_types::cuda::CUdevSmResourceSplit_flags::CU_DEV_SM_RESOURCE_SPLIT_IGNORE_SM_COSCHEDULING => { + writer + .write_all( + stringify!(CU_DEV_SM_RESOURCE_SPLIT_IGNORE_SM_COSCHEDULING) + .as_bytes(), + ) + } + &cuda_types::cuda::CUdevSmResourceSplit_flags::CU_DEV_SM_RESOURCE_SPLIT_MAX_POTENTIAL_CLUSTER_SIZE => { + writer + .write_all( + stringify!(CU_DEV_SM_RESOURCE_SPLIT_MAX_POTENTIAL_CLUSTER_SIZE) + .as_bytes(), + ) + } + _ => write!(writer, "{}", self.0), + } + } +} impl crate::format::CudaDisplay for cuda_types::cuda::CUdevResourceType { fn write( &self, @@ -18274,6 +19439,51 @@ pub fn write_cuStreamGetGreenCtx( crate::format::CudaDisplay::write(&phCtx, "cuStreamGetGreenCtx", arg_idx, writer)?; writer.write_all(b")") } +pub fn write_cuGreenCtxStreamCreate( + writer: &mut (impl std::io::Write + ?Sized), + phStream: *mut cuda_types::cuda::CUstream, + greenCtx: cuda_types::cuda::CUgreenCtx, + flags: ::core::ffi::c_uint, + priority: ::core::ffi::c_int, +) -> std::io::Result<()> { + let mut arg_idx = 0usize; + writer.write_all(b"(")?; + writer.write_all(concat!(stringify!(phStream), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &phStream, + "cuGreenCtxStreamCreate", + arg_idx, + writer, + )?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(greenCtx), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &greenCtx, + "cuGreenCtxStreamCreate", + arg_idx, + writer, + )?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(flags), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &flags, + "cuGreenCtxStreamCreate", + arg_idx, + writer, + )?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(priority), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &priority, + "cuGreenCtxStreamCreate", + arg_idx, + writer, + )?; + writer.write_all(b")") +} pub fn write_cuMemHostRegister( writer: &mut (impl std::io::Write + ?Sized), p: *mut ::core::ffi::c_void, @@ -20437,6 +21647,101 @@ pub fn write_cuMemcpy3DPeerAsync( crate::format::CudaDisplay::write(&hStream, "cuMemcpy3DPeerAsync", arg_idx, writer)?; writer.write_all(b")") } +pub fn 
write_cuMemcpyBatchAsync( + writer: &mut (impl std::io::Write + ?Sized), + dsts: *mut cuda_types::cuda::CUdeviceptr, + srcs: *mut cuda_types::cuda::CUdeviceptr, + sizes: *mut usize, + count: usize, + attrs: *mut cuda_types::cuda::CUmemcpyAttributes, + attrsIdxs: *mut usize, + numAttrs: usize, + failIdx: *mut usize, + hStream: cuda_types::cuda::CUstream, +) -> std::io::Result<()> { + let mut arg_idx = 0usize; + writer.write_all(b"(")?; + writer.write_all(concat!(stringify!(dsts), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&dsts, "cuMemcpyBatchAsync", arg_idx, writer)?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(srcs), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&srcs, "cuMemcpyBatchAsync", arg_idx, writer)?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(sizes), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&sizes, "cuMemcpyBatchAsync", arg_idx, writer)?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(count), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&count, "cuMemcpyBatchAsync", arg_idx, writer)?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(attrs), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&attrs, "cuMemcpyBatchAsync", arg_idx, writer)?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(attrsIdxs), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &attrsIdxs, + "cuMemcpyBatchAsync", + arg_idx, + writer, + )?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(numAttrs), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&numAttrs, "cuMemcpyBatchAsync", arg_idx, writer)?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(failIdx), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&failIdx, "cuMemcpyBatchAsync", arg_idx, writer)?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(hStream), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&hStream, "cuMemcpyBatchAsync", arg_idx, writer)?; + writer.write_all(b")") +} +pub fn write_cuMemcpy3DBatchAsync( + writer: &mut (impl std::io::Write + ?Sized), + numOps: usize, + opList: *mut cuda_types::cuda::CUDA_MEMCPY3D_BATCH_OP, + failIdx: *mut usize, + flags: ::core::ffi::c_ulonglong, + hStream: cuda_types::cuda::CUstream, +) -> std::io::Result<()> { + let mut arg_idx = 0usize; + writer.write_all(b"(")?; + writer.write_all(concat!(stringify!(numOps), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&numOps, "cuMemcpy3DBatchAsync", arg_idx, writer)?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(opList), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&opList, "cuMemcpy3DBatchAsync", arg_idx, writer)?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(failIdx), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &failIdx, + "cuMemcpy3DBatchAsync", + arg_idx, + writer, + )?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(flags), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&flags, "cuMemcpy3DBatchAsync", arg_idx, writer)?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(hStream), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &hStream, + "cuMemcpy3DBatchAsync", + arg_idx, + writer, + )?; + writer.write_all(b")") +} 
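Each write_cu* function above only renders one driver-API call's arguments into any std::io::Write sink; the hook layer that intercepts the call and invokes the formatter is outside this hunk. A hedged usage sketch, using write_cuMemcpy3DBatchAsync from just above (the buffering and eprintln! logging choices here are illustrative assumptions, not this patch's actual hook code):

// Illustrative only (not part of this patch): how a hook might render one
// intercepted cuMemcpy3DBatchAsync call using the formatter above.
#[allow(non_snake_case)]
fn log_cuMemcpy3DBatchAsync(
    numOps: usize,
    opList: *mut cuda_types::cuda::CUDA_MEMCPY3D_BATCH_OP,
    failIdx: *mut usize,
    flags: ::core::ffi::c_ulonglong,
    hStream: cuda_types::cuda::CUstream,
) -> std::io::Result<()> {
    // Format the "(arg: value, ...)" list into an in-memory buffer first.
    let mut buf: Vec<u8> = Vec::new();
    write_cuMemcpy3DBatchAsync(&mut buf, numOps, opList, failIdx, flags, hStream)?;
    // Emit one line per call: function name followed by its argument list.
    eprintln!("cuMemcpy3DBatchAsync{}", String::from_utf8_lossy(&buf));
    Ok(())
}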
pub fn write_cuMemsetD8Async( writer: &mut (impl std::io::Write + ?Sized), dstDevice: cuda_types::cuda::CUdeviceptr, @@ -20677,6 +21982,21 @@ pub fn write_cuStreamGetFlags( crate::format::CudaDisplay::write(&flags, "cuStreamGetFlags", arg_idx, writer)?; writer.write_all(b")") } +pub fn write_cuStreamGetDevice( + writer: &mut (impl std::io::Write + ?Sized), + hStream: cuda_types::cuda::CUstream, + device: *mut cuda_types::cuda::CUdevice, +) -> std::io::Result<()> { + let mut arg_idx = 0usize; + writer.write_all(b"(")?; + writer.write_all(concat!(stringify!(hStream), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&hStream, "cuStreamGetDevice", arg_idx, writer)?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(device), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&device, "cuStreamGetDevice", arg_idx, writer)?; + writer.write_all(b")") +} pub fn write_cuStreamGetCtx( writer: &mut (impl std::io::Write + ?Sized), hStream: cuda_types::cuda::CUstream, @@ -20692,6 +22012,26 @@ pub fn write_cuStreamGetCtx( crate::format::CudaDisplay::write(&pctx, "cuStreamGetCtx", arg_idx, writer)?; writer.write_all(b")") } +pub fn write_cuStreamGetCtx_v2( + writer: &mut (impl std::io::Write + ?Sized), + hStream: cuda_types::cuda::CUstream, + pCtx: *mut cuda_types::cuda::CUcontext, + pGreenCtx: *mut cuda_types::cuda::CUgreenCtx, +) -> std::io::Result<()> { + let mut arg_idx = 0usize; + writer.write_all(b"(")?; + writer.write_all(concat!(stringify!(hStream), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&hStream, "cuStreamGetCtx_v2", arg_idx, writer)?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(pCtx), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&pCtx, "cuStreamGetCtx_v2", arg_idx, writer)?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(pGreenCtx), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&pGreenCtx, "cuStreamGetCtx_v2", arg_idx, writer)?; + writer.write_all(b")") +} pub fn write_cuStreamWaitEvent( writer: &mut (impl std::io::Write + ?Sized), hStream: cuda_types::cuda::CUstream, @@ -22855,6 +24195,61 @@ pub fn write_cuStreamUpdateCaptureDependencies_v2( )?; writer.write_all(b")") } +pub fn write_cuMemBatchDecompressAsync( + writer: &mut (impl std::io::Write + ?Sized), + paramsArray: *mut cuda_types::cuda::CUmemDecompressParams, + count: usize, + flags: ::core::ffi::c_uint, + errorIndex: *mut usize, + stream: cuda_types::cuda::CUstream, +) -> std::io::Result<()> { + let mut arg_idx = 0usize; + writer.write_all(b"(")?; + writer.write_all(concat!(stringify!(paramsArray), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + ¶msArray, + "cuMemBatchDecompressAsync", + arg_idx, + writer, + )?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(count), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &count, + "cuMemBatchDecompressAsync", + arg_idx, + writer, + )?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(flags), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &flags, + "cuMemBatchDecompressAsync", + arg_idx, + writer, + )?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(errorIndex), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &errorIndex, + "cuMemBatchDecompressAsync", + arg_idx, + writer, + )?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(stream), ": ").as_bytes())?; + 
crate::format::CudaDisplay::write( + &stream, + "cuMemBatchDecompressAsync", + arg_idx, + writer, + )?; + writer.write_all(b")") +} pub fn write_cuGetProcAddress( writer: &mut (impl std::io::Write + ?Sized), symbol: *const ::core::ffi::c_char, @@ -22885,6 +24280,151 @@ pub fn write_cuGetProcAddress( crate::format::CudaDisplay::write(&flags, "cuGetProcAddress", arg_idx, writer)?; writer.write_all(b")") } +pub fn write_cuCheckpointProcessGetRestoreThreadId( + writer: &mut (impl std::io::Write + ?Sized), + pid: ::core::ffi::c_int, + tid: *mut ::core::ffi::c_int, +) -> std::io::Result<()> { + let mut arg_idx = 0usize; + writer.write_all(b"(")?; + writer.write_all(concat!(stringify!(pid), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &pid, + "cuCheckpointProcessGetRestoreThreadId", + arg_idx, + writer, + )?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(tid), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &tid, + "cuCheckpointProcessGetRestoreThreadId", + arg_idx, + writer, + )?; + writer.write_all(b")") +} +pub fn write_cuCheckpointProcessGetState( + writer: &mut (impl std::io::Write + ?Sized), + pid: ::core::ffi::c_int, + state: *mut cuda_types::cuda::CUprocessState, +) -> std::io::Result<()> { + let mut arg_idx = 0usize; + writer.write_all(b"(")?; + writer.write_all(concat!(stringify!(pid), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &pid, + "cuCheckpointProcessGetState", + arg_idx, + writer, + )?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(state), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &state, + "cuCheckpointProcessGetState", + arg_idx, + writer, + )?; + writer.write_all(b")") +} +pub fn write_cuCheckpointProcessLock( + writer: &mut (impl std::io::Write + ?Sized), + pid: ::core::ffi::c_int, + args: *mut cuda_types::cuda::CUcheckpointLockArgs, +) -> std::io::Result<()> { + let mut arg_idx = 0usize; + writer.write_all(b"(")?; + writer.write_all(concat!(stringify!(pid), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&pid, "cuCheckpointProcessLock", arg_idx, writer)?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(args), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &args, + "cuCheckpointProcessLock", + arg_idx, + writer, + )?; + writer.write_all(b")") +} +pub fn write_cuCheckpointProcessCheckpoint( + writer: &mut (impl std::io::Write + ?Sized), + pid: ::core::ffi::c_int, + args: *mut cuda_types::cuda::CUcheckpointCheckpointArgs, +) -> std::io::Result<()> { + let mut arg_idx = 0usize; + writer.write_all(b"(")?; + writer.write_all(concat!(stringify!(pid), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &pid, + "cuCheckpointProcessCheckpoint", + arg_idx, + writer, + )?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(args), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &args, + "cuCheckpointProcessCheckpoint", + arg_idx, + writer, + )?; + writer.write_all(b")") +} +pub fn write_cuCheckpointProcessRestore( + writer: &mut (impl std::io::Write + ?Sized), + pid: ::core::ffi::c_int, + args: *mut cuda_types::cuda::CUcheckpointRestoreArgs, +) -> std::io::Result<()> { + let mut arg_idx = 0usize; + writer.write_all(b"(")?; + writer.write_all(concat!(stringify!(pid), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &pid, + "cuCheckpointProcessRestore", + arg_idx, + writer, + )?; + arg_idx += 1; + writer.write_all(b", ")?; + 
writer.write_all(concat!(stringify!(args), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &args, + "cuCheckpointProcessRestore", + arg_idx, + writer, + )?; + writer.write_all(b")") +} +pub fn write_cuCheckpointProcessUnlock( + writer: &mut (impl std::io::Write + ?Sized), + pid: ::core::ffi::c_int, + args: *mut cuda_types::cuda::CUcheckpointUnlockArgs, +) -> std::io::Result<()> { + let mut arg_idx = 0usize; + writer.write_all(b"(")?; + writer.write_all(concat!(stringify!(pid), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &pid, + "cuCheckpointProcessUnlock", + arg_idx, + writer, + )?; + arg_idx += 1; + writer.write_all(b", ")?; + writer.write_all(concat!(stringify!(args), ": ").as_bytes())?; + crate::format::CudaDisplay::write( + &args, + "cuCheckpointProcessUnlock", + arg_idx, + writer, + )?; + writer.write_all(b")") +} impl crate::format::CudaDisplay for cuda_types::cuda::CUoutput_mode_enum { fn write( &self, @@ -23971,6 +25511,15 @@ impl crate::format::CudaDisplay for cuda_types::cuda::CUeglColorFormat_enum { .as_bytes(), ) } + &cuda_types::cuda::CUeglColorFormat_enum::CU_EGL_COLOR_FORMAT_UYVY_709 => { + writer.write_all(stringify!(CU_EGL_COLOR_FORMAT_UYVY_709).as_bytes()) + } + &cuda_types::cuda::CUeglColorFormat_enum::CU_EGL_COLOR_FORMAT_UYVY_709_ER => { + writer.write_all(stringify!(CU_EGL_COLOR_FORMAT_UYVY_709_ER).as_bytes()) + } + &cuda_types::cuda::CUeglColorFormat_enum::CU_EGL_COLOR_FORMAT_UYVY_2020 => { + writer.write_all(stringify!(CU_EGL_COLOR_FORMAT_UYVY_2020).as_bytes()) + } &cuda_types::cuda::CUeglColorFormat_enum::CU_EGL_COLOR_FORMAT_MAX => { writer.write_all(stringify!(CU_EGL_COLOR_FORMAT_MAX).as_bytes()) } @@ -24690,6 +26239,172 @@ pub fn write_cuVDPAUCtxCreate( )?; writer.write_all(b")") } +impl crate::format::CudaDisplay for cuda_types::cuda::cudaDataType_t { + fn write( + &self, + _fn_name: &'static str, + _index: usize, + writer: &mut (impl std::io::Write + ?Sized), + ) -> std::io::Result<()> { + match self { + &cuda_types::cuda::cudaDataType_t::CUDA_R_16F => { + writer.write_all(stringify!(CUDA_R_16F).as_bytes()) + } + &cuda_types::cuda::cudaDataType_t::CUDA_C_16F => { + writer.write_all(stringify!(CUDA_C_16F).as_bytes()) + } + &cuda_types::cuda::cudaDataType_t::CUDA_R_16BF => { + writer.write_all(stringify!(CUDA_R_16BF).as_bytes()) + } + &cuda_types::cuda::cudaDataType_t::CUDA_C_16BF => { + writer.write_all(stringify!(CUDA_C_16BF).as_bytes()) + } + &cuda_types::cuda::cudaDataType_t::CUDA_R_32F => { + writer.write_all(stringify!(CUDA_R_32F).as_bytes()) + } + &cuda_types::cuda::cudaDataType_t::CUDA_C_32F => { + writer.write_all(stringify!(CUDA_C_32F).as_bytes()) + } + &cuda_types::cuda::cudaDataType_t::CUDA_R_64F => { + writer.write_all(stringify!(CUDA_R_64F).as_bytes()) + } + &cuda_types::cuda::cudaDataType_t::CUDA_C_64F => { + writer.write_all(stringify!(CUDA_C_64F).as_bytes()) + } + &cuda_types::cuda::cudaDataType_t::CUDA_R_4I => { + writer.write_all(stringify!(CUDA_R_4I).as_bytes()) + } + &cuda_types::cuda::cudaDataType_t::CUDA_C_4I => { + writer.write_all(stringify!(CUDA_C_4I).as_bytes()) + } + &cuda_types::cuda::cudaDataType_t::CUDA_R_4U => { + writer.write_all(stringify!(CUDA_R_4U).as_bytes()) + } + &cuda_types::cuda::cudaDataType_t::CUDA_C_4U => { + writer.write_all(stringify!(CUDA_C_4U).as_bytes()) + } + &cuda_types::cuda::cudaDataType_t::CUDA_R_8I => { + writer.write_all(stringify!(CUDA_R_8I).as_bytes()) + } + &cuda_types::cuda::cudaDataType_t::CUDA_C_8I => { + writer.write_all(stringify!(CUDA_C_8I).as_bytes()) + } + 
&cuda_types::cuda::cudaDataType_t::CUDA_R_8U => { + writer.write_all(stringify!(CUDA_R_8U).as_bytes()) + } + &cuda_types::cuda::cudaDataType_t::CUDA_C_8U => { + writer.write_all(stringify!(CUDA_C_8U).as_bytes()) + } + &cuda_types::cuda::cudaDataType_t::CUDA_R_16I => { + writer.write_all(stringify!(CUDA_R_16I).as_bytes()) + } + &cuda_types::cuda::cudaDataType_t::CUDA_C_16I => { + writer.write_all(stringify!(CUDA_C_16I).as_bytes()) + } + &cuda_types::cuda::cudaDataType_t::CUDA_R_16U => { + writer.write_all(stringify!(CUDA_R_16U).as_bytes()) + } + &cuda_types::cuda::cudaDataType_t::CUDA_C_16U => { + writer.write_all(stringify!(CUDA_C_16U).as_bytes()) + } + &cuda_types::cuda::cudaDataType_t::CUDA_R_32I => { + writer.write_all(stringify!(CUDA_R_32I).as_bytes()) + } + &cuda_types::cuda::cudaDataType_t::CUDA_C_32I => { + writer.write_all(stringify!(CUDA_C_32I).as_bytes()) + } + &cuda_types::cuda::cudaDataType_t::CUDA_R_32U => { + writer.write_all(stringify!(CUDA_R_32U).as_bytes()) + } + &cuda_types::cuda::cudaDataType_t::CUDA_C_32U => { + writer.write_all(stringify!(CUDA_C_32U).as_bytes()) + } + &cuda_types::cuda::cudaDataType_t::CUDA_R_64I => { + writer.write_all(stringify!(CUDA_R_64I).as_bytes()) + } + &cuda_types::cuda::cudaDataType_t::CUDA_C_64I => { + writer.write_all(stringify!(CUDA_C_64I).as_bytes()) + } + &cuda_types::cuda::cudaDataType_t::CUDA_R_64U => { + writer.write_all(stringify!(CUDA_R_64U).as_bytes()) + } + &cuda_types::cuda::cudaDataType_t::CUDA_C_64U => { + writer.write_all(stringify!(CUDA_C_64U).as_bytes()) + } + &cuda_types::cuda::cudaDataType_t::CUDA_R_8F_E4M3 => { + writer.write_all(stringify!(CUDA_R_8F_E4M3).as_bytes()) + } + &cuda_types::cuda::cudaDataType_t::CUDA_R_8F_UE4M3 => { + writer.write_all(stringify!(CUDA_R_8F_UE4M3).as_bytes()) + } + &cuda_types::cuda::cudaDataType_t::CUDA_R_8F_E5M2 => { + writer.write_all(stringify!(CUDA_R_8F_E5M2).as_bytes()) + } + &cuda_types::cuda::cudaDataType_t::CUDA_R_8F_UE8M0 => { + writer.write_all(stringify!(CUDA_R_8F_UE8M0).as_bytes()) + } + &cuda_types::cuda::cudaDataType_t::CUDA_R_6F_E2M3 => { + writer.write_all(stringify!(CUDA_R_6F_E2M3).as_bytes()) + } + &cuda_types::cuda::cudaDataType_t::CUDA_R_6F_E3M2 => { + writer.write_all(stringify!(CUDA_R_6F_E3M2).as_bytes()) + } + &cuda_types::cuda::cudaDataType_t::CUDA_R_4F_E2M1 => { + writer.write_all(stringify!(CUDA_R_4F_E2M1).as_bytes()) + } + _ => write!(writer, "{}", self.0), + } + } +} +impl crate::format::CudaDisplay for cuda_types::cuda::libraryPropertyType_t { + fn write( + &self, + _fn_name: &'static str, + _index: usize, + writer: &mut (impl std::io::Write + ?Sized), + ) -> std::io::Result<()> { + match self { + &cuda_types::cuda::libraryPropertyType_t::MAJOR_VERSION => { + writer.write_all(stringify!(MAJOR_VERSION).as_bytes()) + } + &cuda_types::cuda::libraryPropertyType_t::MINOR_VERSION => { + writer.write_all(stringify!(MINOR_VERSION).as_bytes()) + } + &cuda_types::cuda::libraryPropertyType_t::PATCH_LEVEL => { + writer.write_all(stringify!(PATCH_LEVEL).as_bytes()) + } + _ => write!(writer, "{}", self.0), + } + } +} +impl crate::format::CudaDisplay for cuda_types::cuda::float2 { + fn write( + &self, + _fn_name: &'static str, + _index: usize, + writer: &mut (impl std::io::Write + ?Sized), + ) -> std::io::Result<()> { + writer.write_all(concat!("{ ", stringify!(x), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&self.x, "", 0, writer)?; + writer.write_all(concat!(", ", stringify!(y), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&self.y, "", 0, writer)?; + 
writer.write_all(b" }") + } +} +impl crate::format::CudaDisplay for cuda_types::cuda::double2 { + fn write( + &self, + _fn_name: &'static str, + _index: usize, + writer: &mut (impl std::io::Write + ?Sized), + ) -> std::io::Result<()> { + writer.write_all(concat!("{ ", stringify!(x), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&self.x, "", 0, writer)?; + writer.write_all(concat!(", ", stringify!(y), ": ").as_bytes())?; + crate::format::CudaDisplay::write(&self.y, "", 0, writer)?; + writer.write_all(b" }") + } +} impl crate::format::CudaDisplay for cuda_types::cuda::CUresult { fn write( &self, @@ -24771,6 +26486,7 @@ impl crate::format::CudaDisplay for cuda_types::cuda::CUresult { writer .write_all("CUDA_ERROR_UNSUPPORTED_DEVSIDE_SYNC".as_bytes()) } + 226 => writer.write_all("CUDA_ERROR_CONTAINED".as_bytes()), 300 => writer.write_all("CUDA_ERROR_INVALID_SOURCE".as_bytes()), 301 => writer.write_all("CUDA_ERROR_FILE_NOT_FOUND".as_bytes()), 302 => { @@ -24841,6 +26557,7 @@ impl crate::format::CudaDisplay for cuda_types::cuda::CUresult { "CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE".as_bytes(), ) } + 721 => writer.write_all("CUDA_ERROR_TENSOR_MEMORY_LEAK".as_bytes()), 800 => writer.write_all("CUDA_ERROR_NOT_PERMITTED".as_bytes()), 801 => writer.write_all("CUDA_ERROR_NOT_SUPPORTED".as_bytes()), 802 => writer.write_all("CUDA_ERROR_SYSTEM_NOT_READY".as_bytes()), @@ -24923,6 +26640,7 @@ impl crate::format::CudaDisplay for cuda_types::cuda::CUresult { "CUDA_ERROR_INVALID_RESOURCE_CONFIGURATION".as_bytes(), ) } + 916 => writer.write_all("CUDA_ERROR_KEY_ROTATION".as_bytes()), 999 => writer.write_all("CUDA_ERROR_UNKNOWN".as_bytes()), err => write!(writer, "{}", err), } diff --git a/zluda_fft/Cargo.toml b/zluda_fft/Cargo.toml new file mode 100644 index 0000000..d56aa2d --- /dev/null +++ b/zluda_fft/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "zluda_fft" +version = "0.0.0" +edition = "2021" + +[lib] +crate-type = ["cdylib"] +name = "cufft" + +[dependencies] +cuda_base = { path = "../cuda_base" } +cuda_types = { path = "../cuda_types" } + +[package.metadata.zluda] +linux_symlinks = [ + "libcufft.so.11", +] diff --git a/zluda_fft/src/impl.rs b/zluda_fft/src/impl.rs new file mode 100644 index 0000000..ece814e --- /dev/null +++ b/zluda_fft/src/impl.rs @@ -0,0 +1,11 @@ +use cuda_types::cufft::cufftResult_t; + +#[cfg(debug_assertions)] +pub(crate) fn unimplemented() -> cufftResult_t { + unimplemented!() +} + +#[cfg(not(debug_assertions))] +pub(crate) fn unimplemented() -> cufftResult_t { + cufftResult_t::CUFFT_NOT_SUPPORTED +} diff --git a/zluda_fft/src/lib.rs b/zluda_fft/src/lib.rs new file mode 100644 index 0000000..6668b7d --- /dev/null +++ b/zluda_fft/src/lib.rs @@ -0,0 +1,18 @@ +mod r#impl; + +macro_rules! 
unimplemented { + ($($abi:literal fn $fn_name:ident( $($arg_id:ident : $arg_type:ty),* ) -> $ret_type:ty;)*) => { + $( + #[cfg_attr(not(test), no_mangle)] + #[allow(improper_ctypes)] + #[allow(improper_ctypes_definitions)] + pub unsafe extern $abi fn $fn_name ( $( $arg_id : $arg_type),* ) -> $ret_type { + crate::r#impl::unimplemented() + } + )* + }; +} + +cuda_base::cufft_function_declarations!( + unimplemented +); diff --git a/zluda_sparse/Cargo.toml b/zluda_sparse/Cargo.toml new file mode 100644 index 0000000..8218b81 --- /dev/null +++ b/zluda_sparse/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "zluda_sparse" +version = "0.0.0" +edition = "2021" + +[lib] +crate-type = ["cdylib"] +name = "cusparse" + +[dependencies] +cuda_base = { path = "../cuda_base" } +cuda_types = { path = "../cuda_types" } + +[package.metadata.zluda] +linux_symlinks = [ + "libcusparse.so.12", +] diff --git a/zluda_sparse/src/impl.rs b/zluda_sparse/src/impl.rs new file mode 100644 index 0000000..726a061 --- /dev/null +++ b/zluda_sparse/src/impl.rs @@ -0,0 +1,53 @@ +use cuda_types::cusparse::cusparseStatus_t; + +#[cfg(debug_assertions)] +pub(crate) fn unimplemented() -> cusparseStatus_t { + unimplemented!() +} + +#[cfg(not(debug_assertions))] +pub(crate) fn unimplemented() -> cusparseStatus_t { + cusparseStatus_t::CUSPARSE_STATUS_NOT_SUPPORTED +} + +#[allow(non_snake_case)] +pub(crate) fn cusparseGetErrorName( + _status: cuda_types::cusparse::cusparseStatus_t, +) -> *const ::core::ffi::c_char { + todo!() +} + +#[allow(non_snake_case)] +pub(crate) fn cusparseGetErrorString( + _status: cuda_types::cusparse::cusparseStatus_t, +) -> *const ::core::ffi::c_char { + todo!() +} + +#[allow(non_snake_case)] +pub(crate) fn cusparseGetMatType( + _descrA: cuda_types::cusparse::cusparseMatDescr_t, +) -> cuda_types::cusparse::cusparseMatrixType_t { + todo!() +} + +#[allow(non_snake_case)] +pub(crate) fn cusparseGetMatFillMode( + _descrA: cuda_types::cusparse::cusparseMatDescr_t, +) -> cuda_types::cusparse::cusparseFillMode_t { + todo!() +} + +#[allow(non_snake_case)] +pub(crate) fn cusparseGetMatDiagType( + _descrA: cuda_types::cusparse::cusparseMatDescr_t, +) -> cuda_types::cusparse::cusparseDiagType_t { + todo!() +} + +#[allow(non_snake_case)] +pub(crate) fn cusparseGetMatIndexBase( + _descrA: cuda_types::cusparse::cusparseMatDescr_t, +) -> cuda_types::cusparse::cusparseIndexBase_t { + todo!() +} diff --git a/zluda_sparse/src/lib.rs b/zluda_sparse/src/lib.rs new file mode 100644 index 0000000..1bddb6a --- /dev/null +++ b/zluda_sparse/src/lib.rs @@ -0,0 +1,42 @@ +mod r#impl; + +pub enum FILE { } + + +macro_rules! unimplemented { + ($($abi:literal fn $fn_name:ident( $($arg_id:ident : $arg_type:ty),* ) -> $ret_type:ty;)*) => { + $( + #[cfg_attr(not(test), no_mangle)] + #[allow(improper_ctypes)] + #[allow(improper_ctypes_definitions)] + pub unsafe extern $abi fn $fn_name ( $( $arg_id : $arg_type),* ) -> $ret_type { + crate::r#impl::unimplemented() + } + )* + }; +} + +macro_rules! 
implemented { + ($($abi:literal fn $fn_name:ident( $($arg_id:ident : $arg_type:ty),* ) -> $ret_type:ty;)*) => { + $( + #[cfg_attr(not(test), no_mangle)] + #[allow(improper_ctypes)] + #[allow(improper_ctypes_definitions)] + pub unsafe extern $abi fn $fn_name ( $( $arg_id : $arg_type),* ) -> $ret_type { + crate::r#impl::$fn_name( $( $arg_id ),* ) + } + )* + }; +} + +cuda_base::cusparse_function_declarations!( + unimplemented, + implemented <= [ + cusparseGetErrorName, + cusparseGetErrorString, + cusparseGetMatIndexBase, + cusparseGetMatDiagType, + cusparseGetMatFillMode, + cusparseGetMatType + ] +);
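For reference, a minimal sketch of what the two macros above expand to. The `"system"` ABI literal and the `cusparseCreate` signature below are illustrative assumptions; the real declaration list is supplied by `cuda_base::cusparse_function_declarations!`. A function named in `implemented <= [...]` forwards to the handwritten body in `src/impl.rs`, while every other declaration becomes a stub that panics in debug builds and returns `CUSPARSE_STATUS_NOT_SUPPORTED` in release builds:

// Illustrative expansion only, not part of the patch.
// Listed in `implemented <= [...]`: forwards to the handwritten impl.
#[cfg_attr(not(test), no_mangle)]
#[allow(improper_ctypes)]
#[allow(improper_ctypes_definitions)]
pub unsafe extern "system" fn cusparseGetErrorName(
    status: cuda_types::cusparse::cusparseStatus_t,
) -> *const ::core::ffi::c_char {
    crate::r#impl::cusparseGetErrorName(status)
}

// Not listed: becomes a stub (signature assumed here for illustration).
#[cfg_attr(not(test), no_mangle)]
#[allow(improper_ctypes)]
#[allow(improper_ctypes_definitions)]
pub unsafe extern "system" fn cusparseCreate(
    handle: *mut cuda_types::cusparse::cusparseHandle_t,
) -> cuda_types::cusparse::cusparseStatus_t {
    crate::r#impl::unimplemented()
}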