diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc
index a6bace82e3..096cfcb4f1 100644
--- a/src/turbomind/models/llama/LlamaBatch.cc
+++ b/src/turbomind/models/llama/LlamaBatch.cc
@@ -505,7 +505,30 @@ bool LlamaBatch<T>::generate()
     // T x = 999999.f;
     // cudaMemcpyAsync(decoder_input_buf_, &x, sizeof(x), cudaMemcpyDefault, stream_);

-    CheckValues(decoder_input_buf_, batch_size_ * llama_->hidden_units_, "embedding_lookup", stream_);
+    // CheckValues(decoder_input_buf_, batch_size_ * llama_->hidden_units_, "embedding_lookup", stream_);
+
+    // if (compare_mode == kCmpWrite) {
+    //     if (rank_ == 0) {
+    //         Compare(decoder_input_buf_, llama_->hidden_units_, Concat("decoder_input", step_), compare_mode,
+    //         stream_);
+    //     }
+    // }
+    // else {
+    //     for (int i = 0; i < batch_size_; ++i) {
+    //         Compare(decoder_input_buf_ + i * llama_->hidden_units_,
+    //                 llama_->hidden_units_,
+    //                 Concat("decoder_input", step_),
+    //                 compare_mode,
+    //                 stream_,
+    //                 Concat("", rank_, i));
+    //     }
+    // }
+    // CheckBatchConsistency(decoder_input_buf_,  //
+    //                       llama_->hidden_units_,
+    //                       batch_size_,
+    //                       Concat("decoder_input", step_),
+    //                       rank_,
+    //                       stream_);

     llama_->decoderForward(decoder_output_buf_,
                            k_cache_ptr_buf_,
@@ -519,12 +542,36 @@ bool LlamaBatch<T>::generate()
                            session_len_,
                            batch_size_);

+    // CheckBatchConsistency(decoder_input_buf_,  //
+    //                       llama_->hidden_units_,
+    //                       batch_size_,
+    //                       Concat("decoder_output", step_),
+    //                       rank_,
+    //                       stream_);
+
+    // if (compare_mode == kCmpWrite) {
+    //     if (rank_ == 0) {
+    //         Compare(decoder_output_buf_, llama_->hidden_units_, Concat("decoder_output", step_), compare_mode,
+    //         stream_);
+    //     }
+    // }
+    // else {
+    //     for (int i = 0; i < batch_size_; ++i) {
+    //         Compare(decoder_output_buf_ + i * llama_->hidden_units_,
+    //                 llama_->hidden_units_,
+    //                 Concat("decoder_output", step_),
+    //                 compare_mode,
+    //                 stream_,
+    //                 Concat("", rank_, i));
+    //     }
+    // }
+
     llama_->postDecodeEmbedding(logits_buf_,  //
                                 local_logits_buf_,
                                 decoder_output_buf_,
                                 batch_size_);

-    CheckValues(logits_buf_, batch_size_ * llama_->vocab_size_padded_, "post_decode_embedding", stream_);
+    // CheckValues(logits_buf_, batch_size_ * llama_->vocab_size_padded_, "post_decode_embedding", stream_);

     // stop-words & bad-words require the matched tokens to be contiguous, so item size > 1 is
     // not supported yet.
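(Aside, not part of the patch: the dump keys and message suffixes used by the commented-out Compare calls above come from Concat, which joins its arguments with underscores; see the llama_utils.h hunk at the end of this diff. A minimal illustration, with a hypothetical function name:)

#include "src/turbomind/models/llama/llama_utils.h"

#include <cassert>

void concat_key_example()
{
    using turbomind::Concat;
    // Dump key produced at decoding step 7 by the Compare calls above.
    assert(Concat("decoder_input", 7) == "decoder_input_7");
    // Per-item message suffix: tensor-parallel rank 0, batch index 3.
    assert(Concat("", 0, 3) == "_0_3");
}
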
diff --git a/src/turbomind/models/llama/LlamaContextDecoder.cc b/src/turbomind/models/llama/LlamaContextDecoder.cc
index 2e666da441..7edf087a8b 100644
--- a/src/turbomind/models/llama/LlamaContextDecoder.cc
+++ b/src/turbomind/models/llama/LlamaContextDecoder.cc
@@ -245,14 +245,14 @@ void LlamaContextDecoder<T>::forward(std::unordered_map<std::string, Tensor>*
                              stream_);
     sync_check_cuda_error();

-    CheckValues(decoder_output, sess.token_num * hidden_units_, Concat("prefill_norm", 0), stream_);
+    // CheckValues(decoder_output, sess.token_num * hidden_units_, Concat("prefill_norm", 0), stream_);

     for (size_t layer = 0; layer < num_layer_; ++layer) {
         /////////////////////////////////////////////
         /// self-attention
         forwardSelfAttn(sess, decoder_output, input_tensors, layer, false);

-        CheckValues(decoder_output, sess.token_num * hidden_units_, Concat("prefill_self_attn", layer), stream_);
+        // CheckValues(decoder_output, sess.token_num * hidden_units_, Concat("prefill_self_attn", layer), stream_);

         invokeFusedAddBiasResidualRMSNorm(decoder_input_output,
                                           decoder_output,
@@ -264,7 +264,7 @@ void LlamaContextDecoder<T>::forward(std::unordered_map<std::string, Tensor>*
                                           stream_);
         sync_check_cuda_error();

-        CheckValues(decoder_output, sess.token_num * hidden_units_, Concat("prefill_norm1", layer), stream_);
+        // CheckValues(decoder_output, sess.token_num * hidden_units_, Concat("prefill_norm1", layer), stream_);

         ////////////////////////////////////////////
         /// feed-forward network
@@ -273,7 +273,7 @@ void LlamaContextDecoder<T>::forward(std::unordered_map<std::string, Tensor>*
             {"ffn_output", {MEMORY_GPU, data_type_, {sess.token_num, hidden_units_}, decoder_output}}};
         silu_ffn_layer_->forward(&ffn_outputs, &ffn_inputs, &decoder_layer_weights->at(layer)->ffn_weights);

-        CheckValues(decoder_output, sess.token_num * hidden_units_, Concat("prefill_ffn", layer), stream_);
+        // CheckValues(decoder_output, sess.token_num * hidden_units_, Concat("prefill_ffn", layer), stream_);

         auto scale_weight = layer < num_layer_ - 1 ? decoder_layer_weights->at(layer + 1)->self_attn_norm_weights :
                                                      input_tensors->at("output_norm_weight").getPtr<T>();
@@ -287,7 +287,7 @@ void LlamaContextDecoder<T>::forward(std::unordered_map<std::string, Tensor>*
                                           stream_);
         sync_check_cuda_error();

-        CheckValues(decoder_output, sess.token_num * hidden_units_, Concat("prefill_norm2", layer), stream_);
+        // CheckValues(decoder_output, sess.token_num * hidden_units_, Concat("prefill_norm2", layer), stream_);
     }

     if (is_free_buffer_after_forward_) {
diff --git a/src/turbomind/models/llama/LlamaDecoder.cc b/src/turbomind/models/llama/LlamaDecoder.cc
index 30c82761bf..7b934f3cb2 100644
--- a/src/turbomind/models/llama/LlamaDecoder.cc
+++ b/src/turbomind/models/llama/LlamaDecoder.cc
@@ -195,6 +195,8 @@ void LlamaDecoder<T>::forward(std::unordered_map<std::string, Tensor>* ou
     T* decoder_input  = input_tensors->at("decoder_input").getPtr<T>();
     T* decoder_output = output_tensors->at("decoder_output").getPtr<T>();

+    // int step = input_tensors->at("step").getVal<int>();
+
     ////////////////////////////////////////////
     /// RMSNorm
     invokeRootMeanSquareNorm(decoder_output,
@@ -206,13 +208,27 @@ void LlamaDecoder<T>::forward(std::unordered_map<std::string, Tensor>* ou
                              stream_);
     sync_check_cuda_error();

-    CheckValues(decoder_output, sess.batch_size * hidden_units_, Concat("decode_norm", 0), stream_);
+    // CheckValues(decoder_output, sess.batch_size * hidden_units_, Concat("decode_norm", 0), stream_);
+
+    // CheckBatchConsistency(decoder_output,
+    //                       hidden_units_,
+    //                       sess.batch_size,
+    //                       Concat("decode_norm", step, 0),
+    //                       tensor_para_.rank_,
+    //                       stream_);

     for (size_t layer = 0; layer < num_layer_; ++layer) {
         // output: self_attn_output_, k_cache, v_cache = self_attn(decoder_normed_input_)
         forwardSelfAttn(sess, decoder_output, input_tensors, layer);

-        CheckValues(decoder_output, sess.batch_size * hidden_units_, Concat("decode_self_attn", layer), stream_);
+        // CheckBatchConsistency(decoder_output,
+        //                       hidden_units_,
+        //                       sess.batch_size,
+        //                       Concat("decode_self_attn", step, layer),
+        //                       tensor_para_.rank_,
+        //                       stream_);
+
+        // CheckValues(decoder_output, sess.batch_size * hidden_units_, Concat("decode_self_attn", layer), stream_);

         invokeFusedAddBiasResidualRMSNorm(decoder_input,
                                           decoder_output,
@@ -224,12 +240,12 @@ void LlamaDecoder<T>::forward(std::unordered_map<std::string, Tensor>* ou
                                           stream_);
         sync_check_cuda_error();

-        CheckValues(decoder_output, sess.batch_size * hidden_units_, Concat("decode_norm1", layer), stream_);
+        // CheckValues(decoder_output, sess.batch_size * hidden_units_, Concat("decode_norm1", layer), stream_);

         // decoder_layer_output_ = ffn(decoder_normed_input_)
         forwardFfn(sess, decoder_output, layer);

-        CheckValues(decoder_output, sess.batch_size * hidden_units_, Concat("decode_ffn", layer), stream_);
+        // CheckValues(decoder_output, sess.batch_size * hidden_units_, Concat("decode_ffn", layer), stream_);

         auto scale_weight = layer < num_layer_ - 1 ? decoder_layer_weights->at(layer + 1)->self_attn_norm_weights :
                                                      input_tensors->at("output_norm_weight").getPtr<T>();
@@ -243,7 +259,7 @@ void LlamaDecoder<T>::forward(std::unordered_map<std::string, Tensor>* ou
                                           stream_);
         sync_check_cuda_error();

-        CheckValues(decoder_output, sess.batch_size * hidden_units_, Concat("decode_norm2", layer), stream_);
+        // CheckValues(decoder_output, sess.batch_size * hidden_units_, Concat("decode_norm2", layer), stream_);
     }

     if (is_free_buffer_after_forward_) {
diff --git a/src/turbomind/models/llama/LlamaDecoderSelfAttentionLayer.cc b/src/turbomind/models/llama/LlamaDecoderSelfAttentionLayer.cc
index 103b32e88f..4eb7937ba6 100644
--- a/src/turbomind/models/llama/LlamaDecoderSelfAttentionLayer.cc
+++ b/src/turbomind/models/llama/LlamaDecoderSelfAttentionLayer.cc
@@ -25,6 +25,7 @@
 #include "src/turbomind/models/llama/llama_utils.h"
 #include "src/turbomind/utils/cuda_utils.h"
 #include "src/turbomind/utils/logger.h"
+#include "src/turbomind/utils/nccl_utils.h"
 #include "src/turbomind/utils/nvtx_utils.h"
 #include
 // #include
@@ -236,10 +237,24 @@ void LlamaDecoderSelfAttentionLayer<T>::forward(TensorMap* o
     allocateBuffer(batch_size, step, max_seq_len);

+    // CheckBatchConsistency((T*)input_query_data,
+    //                       hidden_units_,
+    //                       batch_size,
+    //                       Concat("before_qkv_gemm", step, layer_id),
+    //                       tensor_para_.rank_,
+    //                       stream_);
+
     PUSH_RANGE("qkv_gemm");
     linear_.forward(qkv_buf_, input_query_data, batch_size, weights->qkv);
     POP_RANGE;

+    // CheckBatchConsistency(qkv_buf_,
+    //                       (local_head_num_ + 2 * local_kv_head_num_) * size_per_head_,
+    //                       batch_size,
+    //                       Concat("after_qkv_gemm", step, layer_id),
+    //                       tensor_para_.rank_,
+    //                       stream_);
+
     const auto kv_cache_layer_offset = layer_id * local_kv_head_num_ * max_seq_len * size_per_head_;
     const int  memory_len            = max_seq_len;

@@ -287,15 +302,38 @@ void LlamaDecoderSelfAttentionLayer<T>::forward(TensorMap* o
         stream_);
     sync_check_cuda_error();

+    // CheckBatchConsistency((T*)context_buf_,
+    //                       local_hidden_units_,
+    //                       batch_size,
+    //                       Concat("before_o_gemm", step, layer_id),
+    //                       tensor_para_.rank_,
+    //                       stream_);
+
     linear_.forward(hidden_features_data, context_buf_, batch_size, weights->output);

+    // CheckBatchConsistency(hidden_features_data,
+    //                       hidden_units_,
+    //                       batch_size,
+    //                       Concat("after_o_gemm", step, layer_id),
+    //                       tensor_para_.rank_,
+    //                       stream_);
+
     if (tensor_para_.world_size_ > 1) {
         NcclGuard nccl_guard(tensor_para_, stream_);
         ftNcclAllReduceSum(
             hidden_features_data, hidden_features_data, batch_size * hidden_units_, tensor_para_, stream_);
         sync_check_cuda_error();
+        // ftNcclStreamSynchronize(tensor_para_, {}, stream_);
+        // sync_check_cuda_error();
     }

+    // CheckBatchConsistency(hidden_features_data,
+    //                       hidden_units_,
+    //                       batch_size,
+    //                       Concat("self_attn_allreduce", step, layer_id),
+    //                       tensor_para_.rank_,
+    //                       stream_);
+
     if (is_free_buffer_after_forward_) {
         freeBuffer();
     }
diff --git a/src/turbomind/models/llama/LlamaV2.cc b/src/turbomind/models/llama/LlamaV2.cc
index 77ffb609ca..97cf0e0a3d 100644
--- a/src/turbomind/models/llama/LlamaV2.cc
+++ b/src/turbomind/models/llama/LlamaV2.cc
@@ -438,8 +438,10 @@ void LlamaV2<T>::internalThreadEntry(int device_id)
     TM_LOG_INFO("[internalThreadEntry] %d", (int)tensor_para_.rank_);
     check_cuda_error(cudaSetDevice(device_id));

+    model_instance_barrier() = shared_state_->barrier.get();
+
     // initialize global counters
-    CheckValues((T*)0, 0, {}, 0);
+    // CheckValues((T*)0, 0, {}, 0);
     shared_state_->barrier->wait();

     auto& request_queue = shared_state_->request_queue;
diff --git a/src/turbomind/models/llama/llama_utils.cu b/src/turbomind/models/llama/llama_utils.cu
index 77a99e8f77..4d409cde35 100644
--- a/src/turbomind/models/llama/llama_utils.cu
+++ b/src/turbomind/models/llama/llama_utils.cu
@@ -56,7 +56,7 @@ void CheckNan(const T* ptr, size_t size, std::string key, cudaStream_t stream)
 }

 template<typename T>
-void CmpRead(T* ptr, size_t size, std::string key, cudaStream_t stream)
+void CmpRead(T* ptr, size_t size, std::string key, cudaStream_t stream, std::string msg)
 {
     // wait for b
     check_cuda_error(cudaStreamSynchronize(stream));
@@ -88,7 +88,7 @@ void CmpRead(T* ptr, size_t size, std::string key, cudaStream_t stream)
     auto transform_iter = thrust::make_transform_iterator(zip_iter, abs_diff{});
     // sum(abs(a - b))
     auto asum = thrust::reduce(thrust::device, transform_iter, transform_iter + size);
-    std::cerr << key << ": " << asum << " " << asum / size << "\n";
+    std::cerr << key << msg << ": " << asum << " " << asum / size << "\n";
 }

 template<typename T>
@@ -106,11 +106,11 @@ void CmpWrite(T* ptr, size_t size, std::string key, cudaStream_t stream)
 }

 template<typename T>
-void Compare(T* ptr, size_t size, std::string key, CmpMode mode, cudaStream_t stream)
+void Compare(T* ptr, size_t size, std::string key, CmpMode mode, cudaStream_t stream, std::string msg)
 {
     // std::cerr << "Comparing " << key << "\n";
     if (mode == kCmpRead) {
-        CmpRead(ptr, size, key, stream);
+        CmpRead(ptr, size, key, stream, msg);
     }
     else if (mode == kCmpWrite) {
         CmpWrite(ptr, size, key, stream);
@@ -120,9 +120,9 @@ void Compare(T* ptr, size_t size, std::string key, CmpMode mode, cudaStream_t st
     }
 }

-template void Compare(int* ptr, size_t size, std::string key, CmpMode mode, cudaStream_t stream);
-template void Compare(float* ptr, size_t size, std::string key, CmpMode mode, cudaStream_t stream);
-template void Compare(half* ptr, size_t size, std::string key, CmpMode mode, cudaStream_t stream);
+template void Compare(int* ptr, size_t size, std::string key, CmpMode mode, cudaStream_t stream, std::string msg);
+template void Compare(float* ptr, size_t size, std::string key, CmpMode mode, cudaStream_t stream, std::string msg);
+template void Compare(half* ptr, size_t size, std::string key, CmpMode mode, cudaStream_t stream, std::string msg);

 template void CheckNan(const float* ptr, size_t size, std::string key, cudaStream_t stream);
 template void CheckNan(const half* ptr, size_t size, std::string key, cudaStream_t stream);
@@ -257,4 +257,10 @@ void CheckValues(const T* data, int count, const std::string& msg, cudaStream_t
 template void CheckValues(const half* data, int count, const std::string& msg, cudaStream_t stream);
 template void CheckValues(const float* data, int count, const std::string& msg, cudaStream_t stream);

+Barrier*& model_instance_barrier()
+{
+    thread_local Barrier* p{};
+    return p;
+}
+
 }  // namespace turbomind
diff --git a/src/turbomind/models/llama/llama_utils.h b/src/turbomind/models/llama/llama_utils.h
index cd007c9041..40a4688e01 100644
--- a/src/turbomind/models/llama/llama_utils.h
+++ b/src/turbomind/models/llama/llama_utils.h
@@ -1,6 +1,7 @@
 // Copyright (c) OpenMMLab. All rights reserved.
 #pragma once
+#include "src/turbomind/models/llama/Barrier.h"
 #include "src/turbomind/utils/Tensor.h"
 #include
 #include
@@ -9,7 +10,8 @@

 namespace turbomind {

-enum QuantPolicy {
+enum QuantPolicy
+{
     kNone = 0x00,
     // reserve 0x01 and 0x02 for backward compatibility
     kReserve1 = 0x01,
@@ -18,7 +20,8 @@ enum QuantPolicy {
     kCacheKVInt8 = 0x04,
 };

-enum CmpMode {
+enum CmpMode
+{
     kCmpNone,
     kCmpRead,
     kCmpWrite,
@@ -27,7 +30,7 @@ enum CmpMode {
 extern CmpMode compare_mode;

 template<typename T>
-void Compare(T* ptr, size_t size, std::string key, CmpMode mode, cudaStream_t stream);
+void Compare(T* ptr, size_t size, std::string key, CmpMode mode, cudaStream_t stream, std::string msg = {});

 template<typename T>
 void CheckNan(const T* ptr, size_t size, std::string key, cudaStream_t stream);
@@ -50,7 +53,7 @@ inline std::string to_string(std::string x)
 template<typename... Args>
 std::string Concat(std::string key, Args&&... args)
 {
-    std::vector<std::string> args_str{detail::to_string((Args&&)args)...};
+    std::vector<std::string> args_str{detail::to_string((Args &&) args)...};
     for (const auto& s : args_str) {
         key.append("_");
         key.append(s);
@@ -67,4 +70,28 @@ bool isDebug();
 template<typename T>
 void CheckValues(const T* data, int count, const std::string& msg, cudaStream_t stream);

+Barrier*& model_instance_barrier();
+
+template<typename T>
+inline void CheckBatchConsistency(T* ptr, size_t size, int batch_size, std::string key, int rank, cudaStream_t stream)
+{
+    if (compare_mode == kCmpNone) {
+        return;
+    }
+    model_instance_barrier()->wait();
+    if (compare_mode == kCmpWrite) {
+        if (rank == 0) {
+            Compare(ptr, size, key, compare_mode, stream);
+        }
+    }
+    else {
+        if (rank == 0) {
+            for (int i = 0; i < batch_size; ++i) {
+                Compare(ptr + i * size, size, key, compare_mode, stream, Concat("", rank, i));
+            }
+        }
+    }
+    model_instance_barrier()->wait();
+}
+
 }  // namespace turbomind
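
(Usage sketch, not part of the patch: how the new CheckBatchConsistency helper is meant to be driven, mirroring the commented-out call sites in LlamaDecoder.cc above. The wrapper name below is hypothetical. Every tensor-parallel rank must reach the call, since it synchronizes on model_instance_barrier(), which LlamaV2<T>::internalThreadEntry now points at the shared barrier.)

#include "src/turbomind/models/llama/llama_utils.h"

#include <cuda_runtime.h>

namespace turbomind {

// Hypothetical wrapper: dump or compare the decoder output after self-attention at a
// given step and layer. With compare_mode == kCmpWrite, rank 0 writes the first batch
// item's hidden state under the key "decode_self_attn_<step>_<layer>"; with kCmpRead,
// rank 0 re-reads that dump and prints sum(abs(a - b)) for every batch item, so each
// item can be checked against the reference written on the kCmpWrite run. With
// kCmpNone the helper returns immediately.
template<typename T>
void DebugDecodeSelfAttn(
    T* decoder_output, size_t hidden_units, int batch_size, int step, int layer, int rank, cudaStream_t stream)
{
    CheckBatchConsistency(decoder_output,
                          hidden_units,
                          batch_size,
                          Concat("decode_self_attn", step, layer),
                          rank,
                          stream);
}

}  // namespace turbomind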