diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc
index a6bace82e3..096cfcb4f1 100644
--- a/src/turbomind/models/llama/LlamaBatch.cc
+++ b/src/turbomind/models/llama/LlamaBatch.cc
@@ -505,7 +505,30 @@ bool LlamaBatch<T>::generate()
     // T x = 999999.f;
     // cudaMemcpyAsync(decoder_input_buf_, &x, sizeof(x), cudaMemcpyDefault, stream_);

-    CheckValues(decoder_input_buf_, batch_size_ * llama_->hidden_units_, "embedding_lookup", stream_);
+    // CheckValues(decoder_input_buf_, batch_size_ * llama_->hidden_units_, "embedding_lookup", stream_);
+
+    // if (compare_mode == kCmpWrite) {
+    //     if (rank_ == 0) {
+    //         Compare(decoder_input_buf_, llama_->hidden_units_, Concat("decoder_input", step_), compare_mode,
+    //         stream_);
+    //     }
+    // }
+    // else {
+    //     for (int i = 0; i < batch_size_; ++i) {
+    //         Compare(decoder_input_buf_ + i * llama_->hidden_units_,
+    //                 llama_->hidden_units_,
+    //                 Concat("decoder_input", step_),
+    //                 compare_mode,
+    //                 stream_,
+    //                 Concat("", rank_, i));
+    //     }
+    // }
+    // CheckBatchConsistency(decoder_input_buf_,  //
+    //                       llama_->hidden_units_,
+    //                       batch_size_,
+    //                       Concat("decoder_input", step_),
+    //                       rank_,
+    //                       stream_);

     llama_->decoderForward(decoder_output_buf_,
                            k_cache_ptr_buf_,
@@ -519,12 +542,36 @@ bool LlamaBatch<T>::generate()
                            session_len_,
                            batch_size_);

+    // CheckBatchConsistency(decoder_input_buf_,  //
+    //                       llama_->hidden_units_,
+    //                       batch_size_,
+    //                       Concat("decoder_output", step_),
+    //                       rank_,
+    //                       stream_);
+
+    // if (compare_mode == kCmpWrite) {
+    //     if (rank_ == 0) {
+    //         Compare(decoder_output_buf_, llama_->hidden_units_, Concat("decoder_output", step_), compare_mode,
+    //         stream_);
+    //     }
+    // }
+    // else {
+    //     for (int i = 0; i < batch_size_; ++i) {
+    //         Compare(decoder_output_buf_ + i * llama_->hidden_units_,
+    //                 llama_->hidden_units_,
+    //                 Concat("decoder_output", step_),
+    //                 compare_mode,
+    //                 stream_,
+    //                 Concat("", rank_, i));
+    //     }
+    // }
+
     llama_->postDecodeEmbedding(logits_buf_,  //
                                 local_logits_buf_,
                                 decoder_output_buf_,
                                 batch_size_);

-    CheckValues(logits_buf_, batch_size_ * llama_->vocab_size_padded_, "post_decode_embedding", stream_);
+    // CheckValues(logits_buf_, batch_size_ * llama_->vocab_size_padded_, "post_decode_embedding", stream_);

     // stop-words & bad-words require the matched tokens to be contiguous, so item size > 1 is
     // not supported yet.
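(Aside, not part of the patch: the dump keys and message suffixes used by the commented-out Compare calls above come from Concat, which joins its arguments with underscores; see the llama_utils.h hunk at the end of this diff. A minimal illustration, with a hypothetical function name:)

#include "src/turbomind/models/llama/llama_utils.h"

#include <cassert>

void concat_key_example()
{
    using turbomind::Concat;
    // Dump key produced at decoding step 7 by the Compare calls above.
    assert(Concat("decoder_input", 7) == "decoder_input_7");
    // Per-item message suffix: tensor-parallel rank 0, batch index 3.
    assert(Concat("", 0, 3) == "_0_3");
}
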
diff --git a/src/turbomind/models/llama/LlamaContextDecoder.cc b/src/turbomind/models/llama/LlamaContextDecoder.cc
index 2e666da441..7edf087a8b 100644
--- a/src/turbomind/models/llama/LlamaContextDecoder.cc
+++ b/src/turbomind/models/llama/LlamaContextDecoder.cc
@@ -245,14 +245,14 @@ void LlamaContextDecoder<T>::forward(std::unordered_map<std::string, Tensor>*
                              stream_);
     sync_check_cuda_error();

-    CheckValues(decoder_output, sess.token_num * hidden_units_, Concat("prefill_norm", 0), stream_);
+    // CheckValues(decoder_output, sess.token_num * hidden_units_, Concat("prefill_norm", 0), stream_);

     for (size_t layer = 0; layer < num_layer_; ++layer) {
         /////////////////////////////////////////////
         /// self-attention
         forwardSelfAttn(sess, decoder_output, input_tensors, layer, false);

-        CheckValues(decoder_output, sess.token_num * hidden_units_, Concat("prefill_self_attn", layer), stream_);
+        // CheckValues(decoder_output, sess.token_num * hidden_units_, Concat("prefill_self_attn", layer), stream_);

         invokeFusedAddBiasResidualRMSNorm(decoder_input_output,
                                           decoder_output,
@@ -264,7 +264,7 @@ void LlamaContextDecoder<T>::forward(std::unordered_map<std::string, Tensor>*
                                           stream_);
         sync_check_cuda_error();

-        CheckValues(decoder_output, sess.token_num * hidden_units_, Concat("prefill_norm1", layer), stream_);
+        // CheckValues(decoder_output, sess.token_num * hidden_units_, Concat("prefill_norm1", layer), stream_);

         ////////////////////////////////////////////
         /// feed-forward network
@@ -273,7 +273,7 @@ void LlamaContextDecoder<T>::forward(std::unordered_map<std::string, Tensor>*
             {"ffn_output", {MEMORY_GPU, data_type_, {sess.token_num, hidden_units_}, decoder_output}}};
         silu_ffn_layer_->forward(&ffn_outputs, &ffn_inputs, &decoder_layer_weights->at(layer)->ffn_weights);

-        CheckValues(decoder_output, sess.token_num * hidden_units_, Concat("prefill_ffn", layer), stream_);
+        // CheckValues(decoder_output, sess.token_num * hidden_units_, Concat("prefill_ffn", layer), stream_);

         auto scale_weight = layer < num_layer_ - 1 ? decoder_layer_weights->at(layer + 1)->self_attn_norm_weights :
                                                      input_tensors->at("output_norm_weight").getPtr<T>();
@@ -287,7 +287,7 @@ void LlamaContextDecoder<T>::forward(std::unordered_map<std::string, Tensor>*
                                           stream_);
         sync_check_cuda_error();

-        CheckValues(decoder_output, sess.token_num * hidden_units_, Concat("prefill_norm2", layer), stream_);
+        // CheckValues(decoder_output, sess.token_num * hidden_units_, Concat("prefill_norm2", layer), stream_);
     }

     if (is_free_buffer_after_forward_) {
diff --git a/src/turbomind/models/llama/LlamaDecoder.cc b/src/turbomind/models/llama/LlamaDecoder.cc
index 30c82761bf..7b934f3cb2 100644
--- a/src/turbomind/models/llama/LlamaDecoder.cc
+++ b/src/turbomind/models/llama/LlamaDecoder.cc
@@ -195,6 +195,8 @@ void LlamaDecoder<T>::forward(std::unordered_map<std::string, Tensor>* ou
     T* decoder_input  = input_tensors->at("decoder_input").getPtr<T>();
     T* decoder_output = output_tensors->at("decoder_output").getPtr<T>();

+    // int step = input_tensors->at("step").getVal<int>();
+
     ////////////////////////////////////////////
     /// RMSNorm
     invokeRootMeanSquareNorm(decoder_output,
@@ -206,13 +208,27 @@ void LlamaDecoder<T>::forward(std::unordered_map<std::string, Tensor>* ou
                              stream_);
     sync_check_cuda_error();

-    CheckValues(decoder_output, sess.batch_size * hidden_units_, Concat("decode_norm", 0), stream_);
+    // CheckValues(decoder_output, sess.batch_size * hidden_units_, Concat("decode_norm", 0), stream_);
+
+    // CheckBatchConsistency(decoder_output,
+    //                       hidden_units_,
+    //                       sess.batch_size,
+    //                       Concat("decode_norm", step, 0),
+    //                       tensor_para_.rank_,
+    //                       stream_);

     for (size_t layer = 0; layer < num_layer_; ++layer) {
         // output: self_attn_output_, k_cache, v_cache = self_attn(decoder_normed_input_)
         forwardSelfAttn(sess, decoder_output, input_tensors, layer);

-        CheckValues(decoder_output, sess.batch_size * hidden_units_, Concat("decode_self_attn", layer), stream_);
+        // CheckBatchConsistency(decoder_output,
+        //                       hidden_units_,
+        //                       sess.batch_size,
+        //                       Concat("decode_self_attn", step, layer),
+        //                       tensor_para_.rank_,
+        //                       stream_);
+
+        // CheckValues(decoder_output, sess.batch_size * hidden_units_, Concat("decode_self_attn", layer), stream_);

         invokeFusedAddBiasResidualRMSNorm(decoder_input,
                                           decoder_output,
@@ -224,12 +240,12 @@ void LlamaDecoder<T>::forward(std::unordered_map<std::string, Tensor>* ou
                                           stream_);
         sync_check_cuda_error();

-        CheckValues(decoder_output, sess.batch_size * hidden_units_, Concat("decode_norm1", layer), stream_);
+        // CheckValues(decoder_output, sess.batch_size * hidden_units_, Concat("decode_norm1", layer), stream_);

         // decoder_layer_output_ = ffn(decoder_normed_input_)
         forwardFfn(sess, decoder_output, layer);

-        CheckValues(decoder_output, sess.batch_size * hidden_units_, Concat("decode_ffn", layer), stream_);
+        // CheckValues(decoder_output, sess.batch_size * hidden_units_, Concat("decode_ffn", layer), stream_);

         auto scale_weight = layer < num_layer_ - 1 ? decoder_layer_weights->at(layer + 1)->self_attn_norm_weights :
                                                      input_tensors->at("output_norm_weight").getPtr<T>();
@@ -243,7 +259,7 @@ void LlamaDecoder<T>::forward(std::unordered_map<std::string, Tensor>* ou
                                           stream_);
         sync_check_cuda_error();

-        CheckValues(decoder_output, sess.batch_size * hidden_units_, Concat("decode_norm2", layer), stream_);
+        // CheckValues(decoder_output, sess.batch_size * hidden_units_, Concat("decode_norm2", layer), stream_);
     }

     if (is_free_buffer_after_forward_) {
diff --git a/src/turbomind/models/llama/LlamaDecoderSelfAttentionLayer.cc b/src/turbomind/models/llama/LlamaDecoderSelfAttentionLayer.cc
index 103b32e88f..4eb7937ba6 100644
--- a/src/turbomind/models/llama/LlamaDecoderSelfAttentionLayer.cc
+++ b/src/turbomind/models/llama/LlamaDecoderSelfAttentionLayer.cc
@@ -25,6 +25,7 @@
 #include "src/turbomind/models/llama/llama_utils.h"
 #include "src/turbomind/utils/cuda_utils.h"
 #include "src/turbomind/utils/logger.h"
+#include "src/turbomind/utils/nccl_utils.h"
 #include "src/turbomind/utils/nvtx_utils.h"
 #include
 // #include
@@ -236,10 +237,24 @@ void LlamaDecoderSelfAttentionLayer<T>::forward(TensorMap* o
     allocateBuffer(batch_size, step, max_seq_len);

+    // CheckBatchConsistency((T*)input_query_data,
+    //                       hidden_units_,
+    //                       batch_size,
+    //                       Concat("before_qkv_gemm", step, layer_id),
+    //                       tensor_para_.rank_,
+    //                       stream_);
+
     PUSH_RANGE("qkv_gemm");
     linear_.forward(qkv_buf_, input_query_data, batch_size, weights->qkv);
     POP_RANGE;

+    // CheckBatchConsistency(qkv_buf_,
+    //                       (local_head_num_ + 2 * local_kv_head_num_) * size_per_head_,
+    //                       batch_size,
+    //                       Concat("after_qkv_gemm", step, layer_id),
+    //                       tensor_para_.rank_,
+    //                       stream_);
+
     const auto kv_cache_layer_offset = layer_id * local_kv_head_num_ * max_seq_len * size_per_head_;
     const int  memory_len            = max_seq_len;

@@ -287,15 +302,38 @@ void LlamaDecoderSelfAttentionLayer<T>::forward(TensorMap* o
         stream_);
     sync_check_cuda_error();

+    // CheckBatchConsistency((T*)context_buf_,
+    //                       local_hidden_units_,
+    //                       batch_size,
+    //                       Concat("before_o_gemm", step, layer_id),
+    //                       tensor_para_.rank_,
+    //                       stream_);
+
     linear_.forward(hidden_features_data, context_buf_, batch_size, weights->output);

+    // CheckBatchConsistency(hidden_features_data,
+    //                       hidden_units_,
+    //                       batch_size,
+    //                       Concat("after_o_gemm", step, layer_id),
+    //                       tensor_para_.rank_,
+    //                       stream_);
+
     if (tensor_para_.world_size_ > 1) {
         NcclGuard nccl_guard(tensor_para_, stream_);
         ftNcclAllReduceSum(
             hidden_features_data, hidden_features_data, batch_size * hidden_units_, tensor_para_, stream_);
         sync_check_cuda_error();
+        // ftNcclStreamSynchronize(tensor_para_, {}, stream_);
+        // sync_check_cuda_error();
     }

+    // CheckBatchConsistency(hidden_features_data,
+    //                       hidden_units_,
+    //                       batch_size,
+    //                       Concat("self_attn_allreduce", step, layer_id),
+    //                       tensor_para_.rank_,
+    //                       stream_);
+
     if (is_free_buffer_after_forward_) {
         freeBuffer();
     }
diff --git a/src/turbomind/models/llama/LlamaV2.cc b/src/turbomind/models/llama/LlamaV2.cc
index 77ffb609ca..97cf0e0a3d 100644
--- a/src/turbomind/models/llama/LlamaV2.cc
+++ b/src/turbomind/models/llama/LlamaV2.cc
@@ -438,8 +438,10 @@ void LlamaV2<T>::internalThreadEntry(int device_id)
     TM_LOG_INFO("[internalThreadEntry] %d", (int)tensor_para_.rank_);
     check_cuda_error(cudaSetDevice(device_id));

+    model_instance_barrier() = shared_state_->barrier.get();
+
     // initialize global counters
-    CheckValues((T*)0, 0, {}, 0);
+    // CheckValues((T*)0, 0, {}, 0);
     shared_state_->barrier->wait();

     auto& request_queue = shared_state_->request_queue;
diff --git a/src/turbomind/models/llama/llama_utils.cu b/src/turbomind/models/llama/llama_utils.cu
index 77a99e8f77..4d409cde35 100644
--- a/src/turbomind/models/llama/llama_utils.cu
+++ b/src/turbomind/models/llama/llama_utils.cu
@@ -56,7 +56,7 @@ void CheckNan(const T* ptr, size_t size, std::string key, cudaStream_t stream)
 }

 template<typename T>
-void CmpRead(T* ptr, size_t size, std::string key, cudaStream_t stream)
+void CmpRead(T* ptr, size_t size, std::string key, cudaStream_t stream, std::string msg)
 {
     // wait for b
     check_cuda_error(cudaStreamSynchronize(stream));
@@ -88,7 +88,7 @@ void CmpRead(T* ptr, size_t size, std::string key, cudaStream_t stream)
     auto transform_iter = thrust::make_transform_iterator(zip_iter, abs_diff{});
     // sum(abs(a - b))
     auto asum = thrust::reduce(thrust::device, transform_iter, transform_iter + size);
-    std::cerr << key << ": " << asum << " " << asum / size << "\n";
+    std::cerr << key << msg << ": " << asum << " " << asum / size << "\n";
 }

 template<typename T>
@@ -106,11 +106,11 @@ void CmpWrite(T* ptr, size_t size, std::string key, cudaStream_t stream)
 }

 template<typename T>
-void Compare(T* ptr, size_t size, std::string key, CmpMode mode, cudaStream_t stream)
+void Compare(T* ptr, size_t size, std::string key, CmpMode mode, cudaStream_t stream, std::string msg)
 {
     // std::cerr << "Comparing " << key << "\n";
     if (mode == kCmpRead) {
-        CmpRead(ptr, size, key, stream);
+        CmpRead(ptr, size, key, stream, msg);
     }
     else if (mode == kCmpWrite) {
         CmpWrite(ptr, size, key, stream);
@@ -120,9 +120,9 @@ void Compare(T* ptr, size_t size, std::string key, CmpMode mode, cudaStream_t st
     }
 }

-template void Compare(int* ptr, size_t size, std::string key, CmpMode mode, cudaStream_t stream);
-template void Compare(float* ptr, size_t size, std::string key, CmpMode mode, cudaStream_t stream);
-template void Compare(half* ptr, size_t size, std::string key, CmpMode mode, cudaStream_t stream);
+template void Compare(int* ptr, size_t size, std::string key, CmpMode mode, cudaStream_t stream, std::string msg);
+template void Compare(float* ptr, size_t size, std::string key, CmpMode mode, cudaStream_t stream, std::string msg);
+template void Compare(half* ptr, size_t size, std::string key, CmpMode mode, cudaStream_t stream, std::string msg);

 template void CheckNan(const float* ptr, size_t size, std::string key, cudaStream_t stream);
 template void CheckNan(const half* ptr, size_t size, std::string key, cudaStream_t stream);
@@ -257,4 +257,10 @@ void CheckValues(const T* data, int count, const std::string& msg, cudaStream_t
 template void CheckValues(const half* data, int count, const std::string& msg, cudaStream_t stream);
 template void CheckValues(const float* data, int count, const std::string& msg, cudaStream_t stream);

+Barrier*& model_instance_barrier()
+{
+    thread_local Barrier* p{};
+    return p;
+}
+
 }  // namespace turbomind
diff --git a/src/turbomind/models/llama/llama_utils.h b/src/turbomind/models/llama/llama_utils.h
index cd007c9041..40a4688e01 100644
--- a/src/turbomind/models/llama/llama_utils.h
+++ b/src/turbomind/models/llama/llama_utils.h
@@ -1,6 +1,7 @@
 // Copyright (c) OpenMMLab. All rights reserved.
 #pragma once
+#include "src/turbomind/models/llama/Barrier.h"
 #include "src/turbomind/utils/Tensor.h"
 #include
 #include
@@ -9,7 +10,8 @@

 namespace turbomind {

-enum QuantPolicy {
+enum QuantPolicy
+{
     kNone = 0x00,
     // reserve 0x01 and 0x02 for backward compatibility
     kReserve1 = 0x01,
@@ -18,7 +20,8 @@ enum QuantPolicy {
     kCacheKVInt8 = 0x04,
 };

-enum CmpMode {
+enum CmpMode
+{
     kCmpNone,
     kCmpRead,
     kCmpWrite,
@@ -27,7 +30,7 @@ enum CmpMode {
 extern CmpMode compare_mode;

 template<typename T>
-void Compare(T* ptr, size_t size, std::string key, CmpMode mode, cudaStream_t stream);
+void Compare(T* ptr, size_t size, std::string key, CmpMode mode, cudaStream_t stream, std::string msg = {});

 template<typename T>
 void CheckNan(const T* ptr, size_t size, std::string key, cudaStream_t stream);
@@ -50,7 +53,7 @@ inline std::string to_string(std::string x)
 template<typename... Args>
 std::string Concat(std::string key, Args&&... args)
 {
-    std::vector<std::string> args_str{detail::to_string((Args&&)args)...};
+    std::vector<std::string> args_str{detail::to_string((Args &&) args)...};
     for (const auto& s : args_str) {
         key.append("_");
         key.append(s);
@@ -67,4 +70,28 @@ bool isDebug();
 template<typename T>
 void CheckValues(const T* data, int count, const std::string& msg, cudaStream_t stream);

+Barrier*& model_instance_barrier();
+
+template<typename T>
+inline void CheckBatchConsistency(T* ptr, size_t size, int batch_size, std::string key, int rank, cudaStream_t stream)
+{
+    if (compare_mode == kCmpNone) {
+        return;
+    }
+    model_instance_barrier()->wait();
+    if (compare_mode == kCmpWrite) {
+        if (rank == 0) {
+            Compare(ptr, size, key, compare_mode, stream);
+        }
+    }
+    else {
+        if (rank == 0) {
+            for (int i = 0; i < batch_size; ++i) {
+                Compare(ptr + i * size, size, key, compare_mode, stream, Concat("", rank, i));
+            }
+        }
+    }
+    model_instance_barrier()->wait();
+}
+
 }  // namespace turbomind
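
(Usage sketch, not part of the patch: how the new CheckBatchConsistency helper is meant to be driven, mirroring the commented-out call sites in LlamaDecoder.cc above. The wrapper name below is hypothetical. Every tensor-parallel rank must reach the call, since it synchronizes on model_instance_barrier(), which LlamaV2<T>::internalThreadEntry now points at the shared barrier.)

#include "src/turbomind/models/llama/llama_utils.h"

#include <cuda_runtime.h>

namespace turbomind {

// Hypothetical wrapper: dump or compare the decoder output after self-attention at a
// given step and layer. With compare_mode == kCmpWrite, rank 0 writes the first batch
// item's hidden state under the key "decode_self_attn_<step>_<layer>"; with kCmpRead,
// rank 0 re-reads that dump and prints sum(abs(a - b)) for every batch item, so each
// item can be checked against the reference written on the kCmpWrite run. With
// kCmpNone the helper returns immediately.
template<typename T>
void DebugDecodeSelfAttn(
    T* decoder_output, size_t hidden_units, int batch_size, int step, int layer, int rank, cudaStream_t stream)
{
    CheckBatchConsistency(decoder_output,
                          hidden_units,
                          batch_size,
                          Concat("decode_self_attn", step, layer),
                          rank,
                          stream);
}

}  // namespace turbomind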