Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TurboMind 2 #590

Merged
merged 59 commits on Nov 10, 2023
Merged
Show file tree
Hide file tree
Changes from 24 commits
Commits
Show all changes
59 commits
Select commit Hold shift + click to select a range
82809c9
refresh decoder attention kernel
lzhangzz Sep 8, 2023
d1e1c48
block-level kv cache
lzhangzz Sep 11, 2023
a9ff3ce
`BlockManager` & `SequenceManager`
lzhangzz Sep 21, 2023
a7e31c5
update
lzhangzz Sep 25, 2023
3ed6176
update
lzhangzz Sep 25, 2023
79dab4c
update
lzhangzz Sep 28, 2023
ac8a50b
update
lzhangzz Oct 9, 2023
fac21cd
rename
lzhangzz Oct 9, 2023
a0f2450
GQA support
lzhangzz Oct 9, 2023
139f71d
fix context length
lzhangzz Oct 9, 2023
94a5d4a
GQA dispatch
lzhangzz Oct 9, 2023
68aa135
kv8
lzhangzz Oct 11, 2023
b269d53
tune
lzhangzz Oct 11, 2023
d7110e4
async stream cb
lzhangzz Oct 11, 2023
ce2f413
merge recent updates from upstream
lzhangzz Oct 12, 2023
498e9a3
nvtx
lzhangzz Oct 13, 2023
6d47a7a
config parsing
lzhangzz Oct 13, 2023
b49e84e
debug
lzhangzz Oct 13, 2023
b4e8bf1
optimize output cost
lzhangzz Oct 16, 2023
bdf0b41
split-k decoding
lzhangzz Oct 19, 2023
7a7e701
minor
lzhangzz Oct 19, 2023
48761d7
truncate `session_len` by available blocks
lzhangzz Oct 19, 2023
f9410a9
minor
lzhangzz Oct 19, 2023
96b7f4b
license
lzhangzz Oct 19, 2023
f8020e3
fix
lzhangzz Oct 19, 2023
90f5b8f
dispatch `cp.async`
lzhangzz Oct 20, 2023
0fe3ab9
fix linking
lzhangzz Oct 20, 2023
333ce08
fix
lzhangzz Oct 20, 2023
abaca3e
fix deadlock
lzhangzz Oct 20, 2023
79686aa
Merge remote-tracking branch 'origin/main' into tm2
lzhangzz Oct 20, 2023
290e087
guard input length
lzhangzz Oct 20, 2023
ca70033
correct start offset
lzhangzz Oct 20, 2023
32037fd
fix prefill chunking
lzhangzz Oct 20, 2023
0313866
fix `cache_block_seq_len` param passing
lzhangzz Oct 21, 2023
b70a4f6
fix `block_size` fmtstr
lzhangzz Oct 21, 2023
2290461
fix output tokens
lzhangzz Oct 21, 2023
66fa64b
fix batch resizing
lzhangzz Oct 23, 2023
18001cd
fix masking of finished sequences
lzhangzz Oct 23, 2023
8705131
add debug util
lzhangzz Oct 23, 2023
64de1cd
free unused block early
lzhangzz Oct 25, 2023
699b0bf
add ntk scaling and logn scaling
lzhangzz Oct 30, 2023
2e08a0b
cmake flags
lzhangzz Oct 31, 2023
44782a1
fix typo
lzhangzz Nov 2, 2023
39c1a87
w4a16 for sm75
lzhangzz Nov 2, 2023
c8eedef
fix msvc build
lzhangzz Nov 2, 2023
6de4a37
fix msvc build
lzhangzz Nov 2, 2023
86f60c3
fix block verification
lzhangzz Nov 2, 2023
bce90b3
fix msvc build
lzhangzz Nov 2, 2023
683b1b9
use `std::shuffle`
lzhangzz Nov 2, 2023
5563b26
fix lint
lzhangzz Nov 2, 2023
8936413
fix lint
lzhangzz Nov 2, 2023
bd6b89c
fix lint
lzhangzz Nov 2, 2023
8c8d8bf
clear incoming buffer
lzhangzz Nov 6, 2023
d3a1356
clear finished requests
lzhangzz Nov 7, 2023
55dcb8b
fix batch initialization
lzhangzz Nov 7, 2023
b7bf3d7
fix typo
lzhangzz Nov 7, 2023
efe06ea
Merge remote-tracking branch 'origin/main' into tm2
lzhangzz Nov 8, 2023
6b1c38b
fix typo
lzhangzz Nov 9, 2023
15b4921
fix comparison
lzhangzz Nov 9, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,22 @@ option(SPARSITY_SUPPORT "Build project with Ampere sparsity feature support" OFF

option(BUILD_FAST_MATH "Build in fast math mode" ON)

# the environment variable
# ASAN_OPTIONS=protect_shadow_gap=0,intercept_tls_get_addr=0
# must be set at runtime
# https://github.com/google/sanitizers/issues/1322
if (LMDEPLOY_ASAN_ENABLE)
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:-fsanitize=address>)
add_link_options(-fsanitize=address)
endif ()

# notice that ubsan has linker issues for ubuntu < 18.04, see
# https://stackoverflow.com/questions/50024731/ld-unrecognized-option-push-state-no-as-needed
if (LMDEPLOY_UBSAN_ENABLE)
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:-fsanitize=undefined>)
add_link_options(-fsanitize=undefined)
endif ()

if(BUILD_MULTI_GPU)
message(STATUS "Add DBUILD_MULTI_GPU, requires MPI and NCCL")
add_definitions("-DBUILD_MULTI_GPU")
Expand Down
35 changes: 16 additions & 19 deletions examples/cpp/llama/llama_triton_example.cc
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,9 @@ broadCastRequest(const std::vector<int>& v_start_ids,
if (node_id == 0) {
memcpy(v_input_ids.data(), v_start_ids.data(), size_1 * sizeof(int));
memcpy(v_input_lengths.data(), v_start_lengths.data(), size_2 * sizeof(int));
memcpy(v_input_bad_words.data(), v_bad_words.data(), size_bad_words * sizeof(int));
if (!v_input_bad_words.empty()) {
memcpy(v_input_bad_words.data(), v_bad_words.data(), size_bad_words * sizeof(int));
}
}
if (kUSE_MPI) {
ft::mpi::barrier();
Expand Down Expand Up @@ -431,6 +433,8 @@ int main(int argc, char* argv[])
const int beam_width = output_tensors_lists[0].get()->at("output_ids").shape[1];
const int seq_len = output_tensors_lists[0].get()->at("output_ids").shape[2];

ft::FT_CHECK(beam_width == 1);

std::vector<int> seq_lens(batch_size);
// step 6: check results
if (node_id == 0) {
Expand All @@ -440,32 +444,25 @@ int main(int argc, char* argv[])
printf("[WARNING] Cannot write results into output file %s \n", fName.c_str());
}
else {
size_t outCount = batch_size * beam_width * seq_len;
// int* hBuf = new int[outCount];
const size_t outCount = batch_size * beam_width * seq_len;

std::vector<int> hBuf(outCount);

ft::cudaD2Hcpy(hBuf.data(), d_output_ids, outCount);
ft::cudaD2Hcpy(seq_lens.data(), d_seq_lens, batch_size);

std::cout << "sequence length: ";
for (int i = 0; i < batch_size; ++i) {
std::cout << (i ? ", " : "") << seq_lens[i];
}
std::cout << "\n";
{
std::cout << "Writing " << outCount << " elements\n";
int zeroCount = 0;
for (size_t i = 0; i < outCount; i++) {
if (hBuf[i] == int(0))
zeroCount++;
outFile << hBuf[i] << " ";
if ((i + 1) % (seq_len) == 0)
outFile << std::endl;

if (i < 10)
printf("%5d ", hBuf[i]);
if ((i + 1) % (seq_len) == 0 && i < 10)
std::cout << std::endl;

for (int i = 0; i < batch_size; ++i) {
outFile << (i ? "\n" : "");
auto buf = hBuf.data() + seq_len * i;
for (int j = 0; j < seq_lens[i]; ++j) {
outFile << buf[j] << " ";
}
std::cout << std::endl << "zeroCount = " << zeroCount << std::endl;
}
}
}
Expand All @@ -475,7 +472,7 @@ int main(int argc, char* argv[])
}
cudaDeviceSynchronize();

if (1) {
if (0) {
// test time
auto start = std::chrono::high_resolution_clock::now();

Expand Down
1 change: 1 addition & 0 deletions src/turbomind/kernels/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -71,3 +71,4 @@ set_property(TARGET custom_ar_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET custom_ar_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)

add_subdirectory(gemm_s_f16)
add_subdirectory(decoder_multihead_attention)
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,12 @@

////////////////////////////////////////////////////////////////////////////////////////////////////

// cudaFuncAttributes attr{}; \
// cudaFuncGetAttributes(&attr, func); \
// std::cout << "static_smem_sz: " << attr.sharedSizeBytes << std::endl; \
// std::cout << "max_dynamic_smem: " << attr.maxDynamicSharedSizeBytes << std::endl; \
// std::cout << "dynamic_smem_sz: " << smem_sz << std::endl; \

template<typename T, int Dh, int Dh_MAX, typename KERNEL_PARAMS_TYPE>
void mmha_launch_kernel(const KERNEL_PARAMS_TYPE& params, const cudaStream_t& stream)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,7 @@ namespace mmha {
////////////////////////////////////////////////////////////////////////////////////////////////////

template<typename T, int Dh>
struct Qk_vec_m_ {
};
struct Qk_vec_m_ {};

template<>
struct Qk_vec_m_<float, 32> {
Expand Down Expand Up @@ -180,8 +179,7 @@ struct Qk_vec_k_<__nv_fp8_e4m3, 256> {
////////////////////////////////////////////////////////////////////////////////////////////////////

template<typename T, int THREADS_PER_KEY>
struct K_vec_m_ {
};
struct K_vec_m_ {};

template<>
struct K_vec_m_<float, 4> {
Expand Down Expand Up @@ -262,8 +260,7 @@ struct K_vec_k_<__nv_fp8_e4m3, 1> {
////////////////////////////////////////////////////////////////////////////////////////////////////

template<typename T, int V_VEC_SIZE>
struct V_vec_m_ {
};
struct V_vec_m_ {};

template<>
struct V_vec_m_<float, 1> {
Expand Down Expand Up @@ -343,8 +340,7 @@ struct V_vec_k_<__nv_fp8_e4m3, 16> {

#ifdef MMHA_USE_FP32_ACUM_FOR_FMA
template<typename T>
struct Qk_vec_acum_fp32_ {
};
struct Qk_vec_acum_fp32_ {};

template<>
struct Qk_vec_acum_fp32_<float> {
Expand Down Expand Up @@ -426,8 +422,7 @@ struct Qk_vec_acum_fp32_<fp8_4_t> {
////////////////////////////////////////////////////////////////////////////////////////////////////

template<typename T>
struct K_vec_acum_fp32_ {
};
struct K_vec_acum_fp32_ {};

template<>
struct K_vec_acum_fp32_<float> {
Expand Down Expand Up @@ -489,8 +484,7 @@ struct K_vec_acum_fp32_<fp8_4_t> {

#ifdef MMHA_USE_FP32_ACUM_FOR_OUT
template<typename T>
struct V_vec_acum_fp32_ {
};
struct V_vec_acum_fp32_ {};

template<>
struct V_vec_acum_fp32_<float> {
Expand Down Expand Up @@ -1472,6 +1466,8 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params<T>
}
// We don't need to apply the linear position bias here since qi - ki = 0 yields the position bias 0.

printf("QK_last[%d] = %f\n", hi, qk);

qk_max = qk;
qk_smem[tlength - first_step] = qk;
// qk_smem[params.timestep] = qk;
Expand Down Expand Up @@ -1596,6 +1592,7 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params<T>

qk += mul<float, T, float>(params.linear_bias_slopes[hi], dist);
}
// printf("QK_%d = %f\n", (int)ti, qk);
qk_max = is_mask ? qk_max : fmaxf(qk_max, qk);
qk_smem[ti - first_step] = qk;
}
Expand Down Expand Up @@ -1632,6 +1629,10 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params<T>
// Broadcast to all the threads in the warp.
qk_max = __shfl_sync(uint32_t(-1), qk_max, 0);

if (threadIdx.x == 0) {
printf("QK_MAX[%d] = %f\n", hi, (float)qk_max);
}

// Compute the logits and start the sum.
float sum = 0.f;
// for( int ti = tidx; ti <= params.timestep; ti += THREADS_PER_BLOCK ) {
Expand All @@ -1657,6 +1658,10 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params<T>
// Compute the sum.
sum = block_sum<WARPS_PER_BLOCK>(&red_smem[WARPS_PER_BLOCK], sum);

if (threadIdx.x == 0) {
printf("SUM[%d] = %f\n", hi, (float)sum);
}

// Normalize the logits.
float inv_sum = __fdividef(1.f, sum + 1.e-6f);

Expand Down
Loading
Loading