Skip to content

Commit

Permalink
feat: Update v5.3
Browse files Browse the repository at this point in the history
  • Loading branch information
byshiue committed Jan 23, 2023
1 parent ab00df0 commit acb8091
Show file tree
Hide file tree
Showing 1,095 changed files with 189,245 additions and 6,551 deletions.
6 changes: 3 additions & 3 deletions .clang-format
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ Language: Cpp
AccessModifierOffset: -4
AlignAfterOpenBracket: Align
AllowShortEnumsOnASingleLine: false
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
AlignConsecutiveAssignments: true
AlignConsecutiveDeclarations: true
AlignEscapedNewlines: Right
AlignOperands: true
AlignTrailingComments: true
Expand Down Expand Up @@ -59,4 +59,4 @@ SpacesInParentheses: false
SpacesInSquareBrackets: false
Standard: Cpp11
TabWidth: 4
UseTab: Never
UseTab: Never
15 changes: 15 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
docker
.dockerignore
.gitlab
.gitlab-ci.yml

*build*
./models
__pycache__
.vscode
translation
.cache
*.npy
*.pth
*.o
**/.ipynb_checkpoints
7 changes: 6 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,15 @@
./models/
__pycache__/
.vscode
.idea
./translation
.cache
*.npy
*.pth
!tests/data/**/*.npy
/models
**/.ipynb_checkpoints/
/notebooks
**/.ipynb_checkpoints/

/3rdparty/NeMo/
/3rdparty/apex/
8 changes: 6 additions & 2 deletions 3rdparty/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -12,4 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

add_subdirectory(trt_fused_multihead_attention)
add_subdirectory(common)
add_subdirectory(trt_fused_multihead_attention)
if(ENABLE_FP8)
add_subdirectory(fp8_qgmma_1x1)
endif()
25 changes: 25 additions & 0 deletions 3rdparty/common/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#
# Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
cmake_minimum_required(VERSION 3.8)

# Sources of the runtime-loaded CUDA driver API wrapper.
set(cuda_driver_wrapper_files
    cudaDriverWrapper.cpp
)

add_library(cuda_driver_wrapper STATIC ${cuda_driver_wrapper_files})
# cudaDriverWrapper.cpp calls dlopen/dlsym/dlclose on non-Windows platforms, so the
# platform's dynamic-loading library has to reach the link line of whatever consumes
# this static library. CMAKE_DL_LIBS expands to that library (empty where none is needed).
target_link_libraries(cuda_driver_wrapper PRIVATE -lcublas -lcudart ${CMAKE_DL_LIBS})
set_property(TARGET cuda_driver_wrapper PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET cuda_driver_wrapper PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
138 changes: 138 additions & 0 deletions 3rdparty/common/cudaDriverWrapper.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

// Base name of the CUDA driver library. dllOpen() below wraps it with the
// platform-specific prefix/suffix ("nv" + name + ".dll" on Windows,
// "lib" + name + ".so" elsewhere).
#define CUDA_LIB_NAME "cuda"

// Platform shim: dllOpen / dllClose / dllGetSym give the rest of this file one
// spelling for "load a shared library, look up a symbol, unload it".
#if defined(_WIN32)
#if !defined(WIN32_LEAN_AND_MEAN)
#define WIN32_LEAN_AND_MEAN
#endif // !defined(WIN32_LEAN_AND_MEAN)
#include <windows.h>
// `name` must be a string literal: it is pasted between two other literals.
#define dllOpen(name) (void*) LoadLibraryA("nv" name ".dll")
#define dllClose(handle) FreeLibrary(static_cast<HMODULE>(handle))
#define dllGetSym(handle, name) GetProcAddress(static_cast<HMODULE>(handle), name)
#else
#include <dlfcn.h>
// `name` must be a string literal: it is pasted between two other literals.
// NOTE(review): this requests the unversioned file name ("libcuda.so"); confirm
// the target environments provide that name and not only a versioned soname.
#define dllOpen(name) dlopen("lib" name ".so", RTLD_LAZY)
#define dllClose(handle) dlclose(handle)
#define dllGetSym(handle, name) dlsym(handle, name)
#endif

#include "cudaDriverWrapper.h"
// #include "plugin.h"
#include <cuda.h>
#include <stdio.h>

// using namespace nvinfer1;

/**
 * Loads the CUDA driver library at runtime and resolves every driver entry
 * point this wrapper forwards to.
 *
 * Load and lookup failures are reported on stderr rather than silently ignored.
 * Any entry point that could not be resolved is left as a null function
 * pointer; calling the corresponding wrapper method is then invalid.
 */
CUDADriverWrapper::CUDADriverWrapper()
{
    handle = dllOpen(CUDA_LIB_NAME);
    if (handle == nullptr)
    {
        fprintf(stderr, "CUDADriverWrapper: failed to load the CUDA driver library '%s'\n", CUDA_LIB_NAME);
    }

    // Resolves one symbol from the loaded library and reports it when missing.
    // The explicit cast is needed because the Windows lookup returns a function
    // pointer type, which does not convert implicitly to void*.
    auto load_sym = [](void* lib, const char* name) -> void* {
        void* ret = reinterpret_cast<void*>(dllGetSym(lib, name));
        if (ret == nullptr)
        {
            fprintf(stderr, "CUDADriverWrapper: failed to resolve CUDA driver symbol '%s'\n", name);
        }
        return ret;
    };

    // Writing through a void** view of each member avoids a direct
    // object-pointer -> function-pointer conversion.
    *(void**) (&_cuGetErrorName) = load_sym(handle, "cuGetErrorName");
    *(void**) (&_cuFuncSetAttribute) = load_sym(handle, "cuFuncSetAttribute");
    *(void**) (&_cuLinkComplete) = load_sym(handle, "cuLinkComplete");
    *(void**) (&_cuModuleUnload) = load_sym(handle, "cuModuleUnload");
    *(void**) (&_cuLinkDestroy) = load_sym(handle, "cuLinkDestroy");
    *(void**) (&_cuModuleLoadData) = load_sym(handle, "cuModuleLoadData");
    // The link-related entry points are looked up under their "_v2" symbol names.
    *(void**) (&_cuLinkCreate) = load_sym(handle, "cuLinkCreate_v2");
    *(void**) (&_cuModuleGetFunction) = load_sym(handle, "cuModuleGetFunction");
    *(void**) (&_cuLinkAddFile) = load_sym(handle, "cuLinkAddFile_v2");
    *(void**) (&_cuLinkAddData) = load_sym(handle, "cuLinkAddData_v2");
    *(void**) (&_cuLaunchCooperativeKernel) = load_sym(handle, "cuLaunchCooperativeKernel");
    *(void**) (&_cuLaunchKernel) = load_sym(handle, "cuLaunchKernel");
}

/**
 * Releases the driver library handle acquired by the constructor.
 *
 * The handle is null when the library failed to load; the platform close
 * functions are only specified for handles returned by a successful open,
 * so the close is skipped in that case.
 */
CUDADriverWrapper::~CUDADriverWrapper()
{
    if (handle != nullptr)
    {
        dllClose(handle);
        handle = nullptr;
    }
}

// Forwards to the driver's cuGetErrorName entry point resolved at construction.
CUresult CUDADriverWrapper::cuGetErrorName(CUresult error, const char** pStr) const
{
    return _cuGetErrorName(error, pStr);
}

// Forwards to the driver's cuFuncSetAttribute entry point resolved at construction.
CUresult CUDADriverWrapper::cuFuncSetAttribute(
    CUfunction hfunc, CUfunction_attribute attrib, int value) const
{
    return _cuFuncSetAttribute(hfunc, attrib, value);
}

// Forwards to the driver's cuLinkComplete entry point resolved at construction.
CUresult CUDADriverWrapper::cuLinkComplete(
    CUlinkState state, void** cubinOut, size_t* sizeOut) const
{
    return _cuLinkComplete(state, cubinOut, sizeOut);
}

// Forwards to the driver's cuModuleUnload entry point resolved at construction.
CUresult CUDADriverWrapper::cuModuleUnload(CUmodule hmod) const
{
    return _cuModuleUnload(hmod);
}

// Forwards to the driver's cuLinkDestroy entry point resolved at construction.
CUresult CUDADriverWrapper::cuLinkDestroy(CUlinkState state) const
{
    return _cuLinkDestroy(state);
}

// Forwards to the driver's cuModuleLoadData entry point resolved at construction.
CUresult CUDADriverWrapper::cuModuleLoadData(CUmodule* module, const void* image) const
{
    return _cuModuleLoadData(module, image);
}

// Forwards to the driver entry point that the constructor resolved from the
// "cuLinkCreate_v2" symbol.
CUresult CUDADriverWrapper::cuLinkCreate(unsigned int  numOptions,
                                         CUjit_option* options,
                                         void**        optionValues,
                                         CUlinkState*  stateOut) const
{
    return _cuLinkCreate(numOptions, options, optionValues, stateOut);
}

// Forwards to the driver's cuModuleGetFunction entry point resolved at construction.
CUresult CUDADriverWrapper::cuModuleGetFunction(
    CUfunction* hfunc, CUmodule hmod, const char* name) const
{
    return _cuModuleGetFunction(hfunc, hmod, name);
}

// Forwards to the driver entry point that the constructor resolved from the
// "cuLinkAddFile_v2" symbol.
CUresult CUDADriverWrapper::cuLinkAddFile(CUlinkState    state,
                                          CUjitInputType type,
                                          const char*    path,
                                          unsigned int   numOptions,
                                          CUjit_option*  options,
                                          void**         optionValues) const
{
    return _cuLinkAddFile(state, type, path, numOptions, options, optionValues);
}

// Forwards to the driver entry point that the constructor resolved from the
// "cuLinkAddData_v2" symbol.
CUresult CUDADriverWrapper::cuLinkAddData(CUlinkState    state,
                                          CUjitInputType type,
                                          void*          data,
                                          size_t         size,
                                          const char*    name,
                                          unsigned int   numOptions,
                                          CUjit_option*  options,
                                          void**         optionValues) const
{
    return _cuLinkAddData(state, type, data, size, name, numOptions, options, optionValues);
}

// Forwards to the driver's cuLaunchCooperativeKernel entry point resolved at
// construction; all arguments are passed through unchanged and in order.
CUresult CUDADriverWrapper::cuLaunchCooperativeKernel(CUfunction   f,
                                                      unsigned int gridDimX,
                                                      unsigned int gridDimY,
                                                      unsigned int gridDimZ,
                                                      unsigned int blockDimX,
                                                      unsigned int blockDimY,
                                                      unsigned int blockDimZ,
                                                      unsigned int sharedMemBytes,
                                                      CUstream     hStream,
                                                      void**       kernelParams) const
{
    return _cuLaunchCooperativeKernel(f,
                                      gridDimX,
                                      gridDimY,
                                      gridDimZ,
                                      blockDimX,
                                      blockDimY,
                                      blockDimZ,
                                      sharedMemBytes,
                                      hStream,
                                      kernelParams);
}

// Forwards to the driver's cuLaunchKernel entry point resolved at construction;
// all arguments are passed through unchanged and in order.
CUresult CUDADriverWrapper::cuLaunchKernel(CUfunction   f,
                                           unsigned int gridDimX,
                                           unsigned int gridDimY,
                                           unsigned int gridDimZ,
                                           unsigned int blockDimX,
                                           unsigned int blockDimY,
                                           unsigned int blockDimZ,
                                           unsigned int sharedMemBytes,
                                           CUstream     hStream,
                                           void**       kernelParams,
                                           void**       extra) const
{
    return _cuLaunchKernel(f,
                           gridDimX,
                           gridDimY,
                           gridDimZ,
                           blockDimX,
                           blockDimY,
                           blockDimZ,
                           sharedMemBytes,
                           hStream,
                           kernelParams,
                           extra);
}
101 changes: 101 additions & 0 deletions 3rdparty/common/cudaDriverWrapper.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef CUDA_DRIVER_WRAPPER_H
#define CUDA_DRIVER_WRAPPER_H

#include <cstdio>
#include <cuda.h>
#pragma once

// Checks the CUresult `stat` via cuErrCheck_(), passing along the wrapper `wrap`
// and the call site's file and line.
// NOTE(review): the expansion is a bare `{ ... }` block, so `cuErrCheck(a, b);`
// expands to a block followed by an empty statement. That breaks an un-braced
// `if (...) cuErrCheck(a, b); else ...`; confirm no caller relies on that form.
#define cuErrCheck(stat, wrap) \
{ \
cuErrCheck_((stat), wrap, __FILE__, __LINE__); \
}

// namespace nvinfer1
// {
// Runtime-loaded facade over a subset of the CUDA driver API.
//
// Each public method carries the name and parameter list of a CUDA driver
// function and is backed by a private function-pointer member of the same name
// with a leading underscore. The constructor (see cudaDriverWrapper.cpp) opens
// the driver library and fills in those pointers; the destructor closes the
// library handle.
//
// NOTE(review): the class releases `handle` in its destructor but declares no
// copy/move operations, so implicitly generated copies would share one handle.
// Confirm instances are never copied, or delete the copy operations.
class CUDADriverWrapper
{
public:
// Opens the CUDA driver library and resolves the entry points listed below.
CUDADriverWrapper();

// Closes the library handle opened by the constructor.
~CUDADriverWrapper();

// The methods below forward their arguments unchanged to the like-named driver
// function pointer and return its CUresult. All are const: they only read the
// stored pointers.

CUresult cuGetErrorName(CUresult error, const char** pStr) const;

CUresult cuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attrib, int value) const;

CUresult cuLinkComplete(CUlinkState state, void** cubinOut, size_t* sizeOut) const;

CUresult cuModuleUnload(CUmodule hmod) const;

CUresult cuLinkDestroy(CUlinkState state) const;

CUresult cuModuleLoadData(CUmodule* module, const void* image) const;

CUresult cuLinkCreate(
unsigned int numOptions, CUjit_option* options, void** optionValues, CUlinkState* stateOut) const;

CUresult cuModuleGetFunction(CUfunction* hfunc, CUmodule hmod, const char* name) const;

CUresult cuLinkAddFile(CUlinkState state, CUjitInputType type, const char* path, unsigned int numOptions,
CUjit_option* options, void** optionValues) const;

CUresult cuLinkAddData(CUlinkState state, CUjitInputType type, void* data, size_t size, const char* name,
unsigned int numOptions, CUjit_option* options, void** optionValues) const;

CUresult cuLaunchCooperativeKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY,
unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ,
unsigned int sharedMemBytes, CUstream hStream, void** kernelParams) const;

CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ,
unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes,
CUstream hStream, void** kernelParams, void** extra) const;

private:
// Handle of the dynamically loaded driver library (opaque, platform-specific).
void* handle;
// Resolved driver entry points, one per public method above. The constructor
// assigns them through a void** view, so they must remain plain function pointers.
CUresult (*_cuGetErrorName)(CUresult, const char**);
CUresult (*_cuFuncSetAttribute)(CUfunction, CUfunction_attribute, int);
CUresult (*_cuLinkComplete)(CUlinkState, void**, size_t*);
CUresult (*_cuModuleUnload)(CUmodule);
CUresult (*_cuLinkDestroy)(CUlinkState);
CUresult (*_cuLinkCreate)(unsigned int, CUjit_option*, void**, CUlinkState*);
CUresult (*_cuModuleLoadData)(CUmodule*, const void*);
CUresult (*_cuModuleGetFunction)(CUfunction*, CUmodule, const char*);
CUresult (*_cuLinkAddFile)(CUlinkState, CUjitInputType, const char*, unsigned int, CUjit_option*, void**);
CUresult (*_cuLinkAddData)(
CUlinkState, CUjitInputType, void*, size_t, const char*, unsigned int, CUjit_option*, void**);
CUresult (*_cuLaunchCooperativeKernel)(CUfunction, unsigned int, unsigned int, unsigned int, unsigned int,
unsigned int, unsigned int, unsigned int, CUstream, void**);
CUresult (*_cuLaunchKernel)(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ,
unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes,
CUstream hStream, void** kernelParams, void** extra);
};

/**
 * Reports a failed CUDA driver call on stderr; does nothing for CUDA_SUCCESS.
 *
 * @param stat  Result code returned by the driver call being checked.
 * @param wrap  Wrapper used to translate the code into its symbolic name.
 * @param file  Source file of the call site (normally __FILE__).
 * @param line  Source line of the call site (normally __LINE__).
 *
 * The function only logs; it neither aborts nor throws.
 */
inline void cuErrCheck_(CUresult stat, const CUDADriverWrapper& wrap, const char* file, int line)
{
    if (stat == CUDA_SUCCESS)
    {
        return;
    }

    const char* msg = nullptr;
    // The name lookup can itself fail (e.g. for a code the driver does not
    // recognize) and leave msg null; passing a null pointer to "%s" is
    // undefined behavior, so substitute a placeholder.
    if (wrap.cuGetErrorName(stat, &msg) != CUDA_SUCCESS || msg == nullptr)
    {
        msg = "<unrecognized CUresult>";
    }
    fprintf(stderr, "CUDA Error: %s %s %d\n", msg, file, line);
}

// } // namespace nvinfer1

#endif // CUDA_DRIVER_WRAPPER_H
26 changes: 26 additions & 0 deletions 3rdparty/fp8_qgmma_1x1/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

cmake_minimum_required(VERSION 3.8)

# Collect the utility translation unit together with every cubins/*.cubin.cpp
# file; both patterns are evaluated relative to this directory.
file(GLOB fp8_gemm_1x1_files
    fp8_qgmma_1x1_utils.cu
    cubins/*.cubin.cpp
)

add_library(fp8_qgmma_1x1_utils STATIC ${fp8_gemm_1x1_files})
target_link_libraries(fp8_qgmma_1x1_utils PUBLIC cuda_driver_wrapper)
set_target_properties(fp8_qgmma_1x1_utils PROPERTIES
    POSITION_INDEPENDENT_CODE ON
    CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
24 changes: 24 additions & 0 deletions 3rdparty/fp8_qgmma_1x1/compute.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#include "fp8_gemm_1x1.h"

// Groups the host-side configuration object for the fp8 1x1 GEMM compute stage.
struct Compute {
    // Host-side holder that fills in a ComputeParams and hands out copies of it.
    struct Host {
        // Value-initialized so that params() returns well-defined contents even
        // when it is called before configure().
        ComputeParams _params{};
        __host__ Host() {}

        // Returns a copy of the currently stored parameters.
        __host__ ComputeParams params() const { return _params; }

        // Stores the output pointer, the problem dimensions and the two scale
        // factors, plus the derived products PQ = P*Q and NPQ = N*P*Q.
        // NOTE(review): the products are computed in int; confirm N*P*Q cannot
        // overflow for the supported problem sizes.
        __host__ void configure(uint8_t* D, int N, int P, int Q, int C, int K, float ab_scale, float d_scale)
        {
            _params.D        = D;
            _params.N        = N;
            _params.NPQ      = N * P * Q;
            _params.PQ       = P * Q;
            _params.P        = P;
            _params.Q        = Q;
            _params.C        = C;
            _params.K        = K;
            _params.ab_scale = ab_scale;
            _params.d_scale  = d_scale;
        }
    };
};
Loading

0 comments on commit acb8091

Please sign in to comment.