Commit

remove debug code
lzhangzz committed Nov 28, 2024
1 parent 90d2529 commit f565ef7
Showing 2 changed files with 0 additions and 63 deletions.
18 changes: 0 additions & 18 deletions src/turbomind/models/llama/unified_attention_layer.cc
@@ -428,24 +428,6 @@ inline void UnifiedAttentionLayer<T>::forward(TensorMap* outputs, const TensorMa

    count_and_fix(qkv_buf_3_, token_num * weights->output.input_dims, Concat("attn", layer_id), 3);

#if 0
    if (!isTuning()) {
        T* o{};
        cudaMallocAsync(&o, sizeof(T) * token_num * head_num_ * size_per_head_, stream_);
        cudaMemsetAsync(o, 0, sizeof(T) * token_num * head_num_ * size_per_head_, stream_);
        auto dst = o;
        auto src = qkv_buf_3_;
        for (int i = 0; i < token_num; ++i) {
            for (int j = 0; j < head_num_; ++j) {
                cudaMemcpyAsync(dst, src, sizeof(T) * 128, cudaMemcpyDefault, stream_);
                src += 192;
                dst += 128;
            }
        }
        Compare(o, token_num * head_num_ * 128, "attn", kCmpRead, stream_);
    }
#endif

    //////////////////////////////////////////////
    /// output gemm <Bs,HD> -> <Bs,HD>
    linear_->forward(attention_out, qkv_buf_3_, token_num, weights->output, LlamaLinear<T>::kGemm, lora_mask);
45 changes: 0 additions & 45 deletions src/turbomind/models/llama/unified_decoder.cc
@@ -147,16 +147,6 @@ void UnifiedDecoder<T>::forward(TensorMap* outputs, const TensorMap* inputs, con

    const int pf_offset = dc_batch_size;

    const bool flag = false && !isTuning();

    // Compare(decoder_input_output, token_num * hidden_units_, "decoder_input", kCmpRead, stream_);

    // printf("%d %f\n", (int)token_num, rmsnorm_eps_);

    if (flag) {
        Compare(decoder_input_output, token_num * hidden_units_, "norm0", kCmpRead, stream_);
    }

    /////////////////////////////////////////////
    /// RMSNorm
    invokeRMSNorm(decoder_output,
@@ -177,10 +167,6 @@ void UnifiedDecoder<T>::forward(TensorMap* outputs, const TensorMap* inputs, con
            continue;
        }

        if (flag) {
            Compare(decoder_output, token_num * hidden_units_, "attn_input", kCmpRead, stream_);
        }

        /////////////////////////////////////////////
        /// self-attention
        forwardSelfAttn(decoder_output, //
@@ -193,11 +179,6 @@ void UnifiedDecoder<T>::forward(TensorMap* outputs, const TensorMap* inputs, con

        count_and_fix(decoder_output, token_num * hidden_units_, Concat("attn_block", layer), 2);

        if (flag) {
            Compare(decoder_input_output, token_num * hidden_units_, "res0", kCmpRead, stream_);
            Compare(decoder_output, token_num * hidden_units_, "attn_out", kCmpRead, stream_);
        }

        invokeBiasResidualRMSNorm(decoder_input_output,
                                  decoder_output,
                                  weights->at(layer)->ffn_norm_weights,
@@ -208,21 +189,12 @@ void UnifiedDecoder<T>::forward(TensorMap* outputs, const TensorMap* inputs, con
                                  stream_);
        sync_check_cuda_error();

        if (flag) {
            Compare(decoder_input_output, token_num * hidden_units_, "res1", kCmpRead, stream_);
            Compare(decoder_output, token_num * hidden_units_, "ffn_in", kCmpRead, stream_);
        }

        count_and_fix(decoder_input_output, token_num * hidden_units_, Concat("residual0", layer), 2);
        count_and_fix(decoder_output, token_num * hidden_units_, Concat("norm1", layer), 2);

        ////////////////////////////////////////////
        /// feed-forward network

        // if (tp_.rank_ == 0) {
        // Compare(decoder_output, token_num * hidden_units_, Concat("ffn_input", layer), compare_mode, stream_);
        // }

        const bool is_moe = !weights->at(layer)->moe_weights.experts.empty();
        if (is_moe) {
            moe_ffn_layer_->forward(nullptr, decoder_output, token_num, layer, weights->at(layer)->moe_weights);
@@ -241,22 +213,10 @@ void UnifiedDecoder<T>::forward(TensorMap* outputs, const TensorMap* inputs, con
            ffn_layer_->forward(&ffn_outputs, &ffn_inputs, &weights->at(layer)->ffn_weights);
        }

        // if (tp_.rank_ == 0) {
        // Compare(decoder_output, token_num * hidden_units_, Concat("ffn_out", layer), compare_mode, stream_);
        // }

        if (is_moe) {
            moe_ffn_layer_->reduce(decoder_output, token_num, (bool)ffn_layer_, layer, weights->at(layer)->moe_weights);
        }

        if (flag) {
            Compare(decoder_output, token_num * hidden_units_, "ffn_out", kCmpRead, stream_);
        }

        // if (tp_.rank_ == 0) {
        // Compare(decoder_output, token_num * hidden_units_, Concat("moe_ffn_out", layer), compare_mode, stream_);
        // }

        count_and_fix(decoder_output, token_num * hidden_units_, Concat("ffn_block", layer), 2);

        const bool is_last_layer = layer == layer_num_ - 1;
@@ -275,11 +235,6 @@ void UnifiedDecoder<T>::forward(TensorMap* outputs, const TensorMap* inputs, con

        count_and_fix(decoder_input_output, token_num * hidden_units_, Concat("residual1", layer), 2);
        count_and_fix(decoder_output, token_num * hidden_units_, Concat("norm0", layer + 1), 2);

        if (flag) {
            cudaStreamSynchronize(stream_);
            std::abort();
        }
    }

    if (dc_batch_size) {

0 comments on commit f565ef7
