feat: Update v5.3

NVIDIA · Jan 23, 2023 · acb8091 · acb8091
1 parent ab00df0
commit acb8091
Show file tree

Hide file tree

Showing 1,095 changed files with 189,245 additions and 6,551 deletions.
diff --git a/.clang-format b/.clang-format
@@ -2,8 +2,8 @@ Language: Cpp
 AccessModifierOffset: -4
 AlignAfterOpenBracket: Align
 AllowShortEnumsOnASingleLine: false
-AlignConsecutiveAssignments: false
-AlignConsecutiveDeclarations: false
+AlignConsecutiveAssignments: true
+AlignConsecutiveDeclarations: true
 AlignEscapedNewlines: Right
 AlignOperands: true
 AlignTrailingComments: true
@@ -59,4 +59,4 @@ SpacesInParentheses: false
 SpacesInSquareBrackets: false
 Standard: Cpp11
 TabWidth: 4
-UseTab: Never
+UseTab: Never
diff --git a/.dockerignore b/.dockerignore
@@ -0,0 +1,15 @@
+docker
+.dockerignore
+.gitlab
+.gitlab-ci.yml
+
+*build*
+./models
+__pycache__
+.vscode
+translation
+.cache
+*.npy
+*.pth
+*.o
+**/.ipynb_checkpoints
diff --git a/.gitignore b/.gitignore
@@ -4,10 +4,15 @@
 ./models/
 __pycache__/
 .vscode
+.idea
 ./translation
 .cache
 *.npy
 *.pth
 !tests/data/**/*.npy
 /models
-**/.ipynb_checkpoints/
+/notebooks
+**/.ipynb_checkpoints/
+
+/3rdparty/NeMo/
+/3rdparty/apex/
diff --git a/3rdparty/CMakeLists.txt b/3rdparty/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2022, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2019-2023, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,4 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-add_subdirectory(trt_fused_multihead_attention)
+add_subdirectory(common)
+add_subdirectory(trt_fused_multihead_attention)
+if(ENABLE_FP8)
+add_subdirectory(fp8_qgmma_1x1)
+endif()
diff --git a/3rdparty/common/CMakeLists.txt b/3rdparty/common/CMakeLists.txt
@@ -0,0 +1,25 @@
+#
+# Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+cmake_minimum_required(VERSION 3.8)
+
+set(cuda_driver_wrapper_files
+    cudaDriverWrapper.cpp
+)
+# cudaDriverWrapper.cpp calls dlopen/dlsym/dlclose, so also link the platform dl library (empty where not needed).
+add_library(cuda_driver_wrapper STATIC ${cuda_driver_wrapper_files})
+target_link_libraries(cuda_driver_wrapper PRIVATE -lcublas -lcudart ${CMAKE_DL_LIBS})
+set_property(TARGET cuda_driver_wrapper PROPERTY POSITION_INDEPENDENT_CODE  ON)
+set_property(TARGET cuda_driver_wrapper PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS  ON)
diff --git a/3rdparty/common/cudaDriverWrapper.cpp b/3rdparty/common/cudaDriverWrapper.cpp
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define CUDA_LIB_NAME "cuda"
+
+#if defined(_WIN32)
+#if !defined(WIN32_LEAN_AND_MEAN)
+#define WIN32_LEAN_AND_MEAN
+#endif // !defined(WIN32_LEAN_AND_MEAN)
+#include <windows.h>
+#define dllOpen(name) (void*) LoadLibraryA("nv" name ".dll")
+#define dllClose(handle) FreeLibrary(static_cast<HMODULE>(handle))
+#define dllGetSym(handle, name) GetProcAddress(static_cast<HMODULE>(handle), name)
+#else
+#include <dlfcn.h>
+#define dllOpen(name) dlopen("lib" name ".so", RTLD_LAZY)
+#define dllClose(handle) dlclose(handle)
+#define dllGetSym(handle, name) dlsym(handle, name)
+#endif
+
+#include "cudaDriverWrapper.h"
+// #include "plugin.h"
+#include <cuda.h>
+#include <stdio.h>
+
+// using namespace nvinfer1;
+
+CUDADriverWrapper::CUDADriverWrapper()
+{
+    // Open the driver library at run time and resolve each entry point by name; failures are reported on stderr.
+    handle = dllOpen(CUDA_LIB_NAME);
+    if (handle == nullptr) fprintf(stderr, "CUDADriverWrapper: cannot open the CUDA driver library\n");
+    auto load_sym = [](void* lib, const char* name) {
+        void* sym = dllGetSym(lib, name);
+        if (sym == nullptr) fprintf(stderr, "CUDADriverWrapper: cannot resolve %s\n", name);
+        return sym;
+    };
+
+    *(void**) (&_cuGetErrorName) = load_sym(handle, "cuGetErrorName");
+    *(void**) (&_cuFuncSetAttribute) = load_sym(handle, "cuFuncSetAttribute");
+    *(void**) (&_cuLinkComplete) = load_sym(handle, "cuLinkComplete");
+    *(void**) (&_cuModuleUnload) = load_sym(handle, "cuModuleUnload");
+    *(void**) (&_cuLinkDestroy) = load_sym(handle, "cuLinkDestroy");
+    *(void**) (&_cuModuleLoadData) = load_sym(handle, "cuModuleLoadData");
+    *(void**) (&_cuLinkCreate) = load_sym(handle, "cuLinkCreate_v2");
+    *(void**) (&_cuModuleGetFunction) = load_sym(handle, "cuModuleGetFunction");
+    *(void**) (&_cuLinkAddFile) = load_sym(handle, "cuLinkAddFile_v2");
+    *(void**) (&_cuLinkAddData) = load_sym(handle, "cuLinkAddData_v2");
+    *(void**) (&_cuLaunchCooperativeKernel) = load_sym(handle, "cuLaunchCooperativeKernel");
+    *(void**) (&_cuLaunchKernel) = load_sym(handle, "cuLaunchKernel");
+}
+
+CUDADriverWrapper::~CUDADriverWrapper()
+{
+    if (handle != nullptr) dllClose(handle); // the open may have failed; never hand a null handle to dllClose
+}
+
+CUresult CUDADriverWrapper::cuGetErrorName(CUresult error, const char** pStr) const
+{
+    return _cuGetErrorName(error, pStr); // forwards to the "cuGetErrorName" symbol resolved in the constructor
+}
+
+CUresult CUDADriverWrapper::cuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attrib, int value) const
+{
+    return _cuFuncSetAttribute(hfunc, attrib, value); // forwards to the "cuFuncSetAttribute" symbol
+}
+
+CUresult CUDADriverWrapper::cuLinkComplete(CUlinkState state, void** cubinOut, size_t* sizeOut) const
+{
+    return _cuLinkComplete(state, cubinOut, sizeOut); // forwards to the "cuLinkComplete" symbol
+}
+
+CUresult CUDADriverWrapper::cuModuleUnload(CUmodule hmod) const
+{
+    return _cuModuleUnload(hmod); // forwards to the "cuModuleUnload" symbol resolved in the constructor
+}
+
+CUresult CUDADriverWrapper::cuLinkDestroy(CUlinkState state) const
+{
+    return _cuLinkDestroy(state); // forwards to the "cuLinkDestroy" symbol resolved in the constructor
+}
+
+CUresult CUDADriverWrapper::cuModuleLoadData(CUmodule* module, const void* image) const
+{
+    return _cuModuleLoadData(module, image); // forwards to the "cuModuleLoadData" symbol
+}
+
+CUresult CUDADriverWrapper::cuLinkCreate(
+    unsigned int numOptions, CUjit_option* options, void** optionValues, CUlinkState* stateOut) const
+{
+    return _cuLinkCreate(numOptions, options, optionValues, stateOut); // bound to "cuLinkCreate_v2" at construction
+}
+
+CUresult CUDADriverWrapper::cuModuleGetFunction(CUfunction* hfunc, CUmodule hmod, const char* name) const
+{
+    return _cuModuleGetFunction(hfunc, hmod, name); // forwards to the "cuModuleGetFunction" symbol
+}
+
+CUresult CUDADriverWrapper::cuLinkAddFile(CUlinkState state, CUjitInputType type, const char* path,
+    unsigned int numOptions, CUjit_option* options, void** optionValues) const
+{
+    return _cuLinkAddFile(state, type, path, numOptions, options, optionValues); // bound to "cuLinkAddFile_v2"
+}
+
+CUresult CUDADriverWrapper::cuLinkAddData(CUlinkState state, CUjitInputType type, void* data, size_t size,
+    const char* name, unsigned int numOptions, CUjit_option* options, void** optionValues) const
+{
+    return _cuLinkAddData(state, type, data, size, name, numOptions, options, optionValues); // "cuLinkAddData_v2"
+}
+
+CUresult CUDADriverWrapper::cuLaunchCooperativeKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY,
+    unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ,
+    unsigned int sharedMemBytes, CUstream hStream, void** kernelParams) const
+{
+    return _cuLaunchCooperativeKernel(f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ,
+        sharedMemBytes, hStream, kernelParams); // arguments pass through unchanged to the resolved symbol
+}
+
+CUresult CUDADriverWrapper::cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY,
+    unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ,
+    unsigned int sharedMemBytes, CUstream hStream, void** kernelParams, void** extra) const
+{
+    return _cuLaunchKernel(f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes,
+        hStream, kernelParams, extra); // arguments pass through unchanged to the resolved "cuLaunchKernel" symbol
+}
diff --git a/3rdparty/common/cudaDriverWrapper.h b/3rdparty/common/cudaDriverWrapper.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef CUDA_DRIVER_WRAPPER_H
+#define CUDA_DRIVER_WRAPPER_H
+
+#include <cstdio>
+#include <cuda.h>
+#pragma once
+
+#define cuErrCheck(stat, wrap)                                                                                         \
+    do {                                                                                                               \
+        cuErrCheck_((stat), (wrap), __FILE__, __LINE__);                                                               \
+    } while (0) // one statement: takes the caller's ';' and stays correct inside an unbraced if/else
+
+// namespace nvinfer1
+// {
+class CUDADriverWrapper // CUDA driver entry points looked up from the driver library at run time
+{
+public:
+    CUDADriverWrapper();
+    // The constructor opens the driver library and resolves every pointer below; the destructor closes the library.
+    ~CUDADriverWrapper();
+    // Each const method below passes its arguments unchanged to the private function pointer of the same name.
+    CUresult cuGetErrorName(CUresult error, const char** pStr) const;
+
+    CUresult cuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attrib, int value) const;
+
+    CUresult cuLinkComplete(CUlinkState state, void** cubinOut, size_t* sizeOut) const;
+
+    CUresult cuModuleUnload(CUmodule hmod) const;
+
+    CUresult cuLinkDestroy(CUlinkState state) const;
+
+    CUresult cuModuleLoadData(CUmodule* module, const void* image) const;
+
+    CUresult cuLinkCreate(
+        unsigned int numOptions, CUjit_option* options, void** optionValues, CUlinkState* stateOut) const;
+
+    CUresult cuModuleGetFunction(CUfunction* hfunc, CUmodule hmod, const char* name) const;
+
+    CUresult cuLinkAddFile(CUlinkState state, CUjitInputType type, const char* path, unsigned int numOptions,
+        CUjit_option* options, void** optionValues) const;
+
+    CUresult cuLinkAddData(CUlinkState state, CUjitInputType type, void* data, size_t size, const char* name,
+        unsigned int numOptions, CUjit_option* options, void** optionValues) const;
+
+    CUresult cuLaunchCooperativeKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY,
+        unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ,
+        unsigned int sharedMemBytes, CUstream hStream, void** kernelParams) const;
+
+    CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ,
+        unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes,
+        CUstream hStream, void** kernelParams, void** extra) const;
+    // NOTE(review): copies are not disabled and would share `handle`, which every destructor closes - confirm unused.
+private:
+    void* handle; // library handle returned by dllOpen in the constructor and passed to dllClose in the destructor
+    CUresult (*_cuGetErrorName)(CUresult, const char**);
+    CUresult (*_cuFuncSetAttribute)(CUfunction, CUfunction_attribute, int);
+    CUresult (*_cuLinkComplete)(CUlinkState, void**, size_t*);
+    CUresult (*_cuModuleUnload)(CUmodule);
+    CUresult (*_cuLinkDestroy)(CUlinkState);
+    CUresult (*_cuLinkCreate)(unsigned int, CUjit_option*, void**, CUlinkState*); // resolved from "cuLinkCreate_v2"
+    CUresult (*_cuModuleLoadData)(CUmodule*, const void*);
+    CUresult (*_cuModuleGetFunction)(CUfunction*, CUmodule, const char*);
+    CUresult (*_cuLinkAddFile)(CUlinkState, CUjitInputType, const char*, unsigned int, CUjit_option*, void**);
+    CUresult (*_cuLinkAddData)( // resolved from "cuLinkAddData_v2"; _cuLinkAddFile above from "cuLinkAddFile_v2"
+        CUlinkState, CUjitInputType, void*, size_t, const char*, unsigned int, CUjit_option*, void**);
+    CUresult (*_cuLaunchCooperativeKernel)(CUfunction, unsigned int, unsigned int, unsigned int, unsigned int,
+        unsigned int, unsigned int, unsigned int, CUstream, void**);
+    CUresult (*_cuLaunchKernel)(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ,
+        unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes,
+        CUstream hStream, void** kernelParams, void** extra);
+};
+
+inline void cuErrCheck_(CUresult stat, const CUDADriverWrapper& wrap, const char* file, int line)
+{
+    if (stat != CUDA_SUCCESS) // log-only: the failure is printed to stderr and execution continues
+    {
+        const char* msg = nullptr;
+        wrap.cuGetErrorName(stat, &msg); // the driver documents a null name for codes it does not recognize
+        fprintf(stderr, "CUDA Error: %s %s %d\n", msg ? msg : "<unknown>", file, line); // null "%s" is undefined
+    }
+}
+
+// } // namespace nvinfer1
+
+#endif // CUDA_DRIVER_WRAPPER_H
diff --git a/3rdparty/fp8_qgmma_1x1/CMakeLists.txt b/3rdparty/fp8_qgmma_1x1/CMakeLists.txt
@@ -0,0 +1,26 @@
+# Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+cmake_minimum_required(VERSION 3.8)
+
+set(fp8_gemm_1x1_files
+    fp8_qgmma_1x1_utils.cu
+)
+# Re-glob the list: keeps the .cu named above if it exists and adds each cubins/*.cubin.cpp found at configure time.
+file(GLOB fp8_gemm_1x1_files ${fp8_gemm_1x1_files} cubins/*.cubin.cpp)
+
+add_library(fp8_qgmma_1x1_utils STATIC ${fp8_gemm_1x1_files})
+target_link_libraries(fp8_qgmma_1x1_utils PUBLIC cuda_driver_wrapper)
+set_property(TARGET fp8_qgmma_1x1_utils PROPERTY POSITION_INDEPENDENT_CODE  ON)
+set_property(TARGET fp8_qgmma_1x1_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS  ON)
diff --git a/3rdparty/fp8_qgmma_1x1/compute.cuh b/3rdparty/fp8_qgmma_1x1/compute.cuh
@@ -0,0 +1,24 @@
+#include "fp8_gemm_1x1.h"
+
+struct Compute {
+    // Host-side holder of the ComputeParams that configure() fills in and params() returns by value.
+    struct Host {
+        ComputeParams _params;
+        __host__ Host() : _params() {} // value-initialize so params() before configure() never reads garbage
+        __host__ ComputeParams params() { return _params; }
+        // Records the D pointer, the N/P/Q/C/K sizes, the derived products PQ and NPQ, and the two scales.
+        __host__ void configure(uint8_t* D, int N, int P, int Q, int C, int K, float ab_scale, float d_scale)
+        {
+            _params.D = D;
+            _params.N = N;
+            _params.NPQ = N*P*Q; // NOTE(review): evaluated in int - confirm the product cannot overflow
+            _params.PQ = P*Q;
+            _params.P = P;
+            _params.Q = Q;
+            _params.C = C;
+            _params.K = K;
+            _params.ab_scale = ab_scale;
+            _params.d_scale = d_scale;
+        }
+    };
+};