Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TurboMind 2 #590

Merged
merged 59 commits on Nov 10, 2023
Merged
Show file tree
Hide file tree
Changes from 24 commits
Commits
Show all changes
59 commits
Select commit Hold shift + click to select a range
82809c9
refresh decoder attention kernel
lzhangzz Sep 8, 2023
d1e1c48
block-level kv cache
lzhangzz Sep 11, 2023
a9ff3ce
`BlockManager` & `SequenceManager`
lzhangzz Sep 21, 2023
a7e31c5
update
lzhangzz Sep 25, 2023
3ed6176
update
lzhangzz Sep 25, 2023
79dab4c
update
lzhangzz Sep 28, 2023
ac8a50b
update
lzhangzz Oct 9, 2023
fac21cd
rename
lzhangzz Oct 9, 2023
a0f2450
GQA support
lzhangzz Oct 9, 2023
139f71d
fix context length
lzhangzz Oct 9, 2023
94a5d4a
GQA dispatch
lzhangzz Oct 9, 2023
68aa135
kv8
lzhangzz Oct 11, 2023
b269d53
tune
lzhangzz Oct 11, 2023
d7110e4
async stream cb
lzhangzz Oct 11, 2023
ce2f413
merge recent updates from upstream
lzhangzz Oct 12, 2023
498e9a3
nvtx
lzhangzz Oct 13, 2023
6d47a7a
config parsing
lzhangzz Oct 13, 2023
b49e84e
debug
lzhangzz Oct 13, 2023
b4e8bf1
optimize output cost
lzhangzz Oct 16, 2023
bdf0b41
split-k decoding
lzhangzz Oct 19, 2023
7a7e701
minor
lzhangzz Oct 19, 2023
48761d7
truncate `session_len` by available blocks
lzhangzz Oct 19, 2023
f9410a9
minor
lzhangzz Oct 19, 2023
96b7f4b
license
lzhangzz Oct 19, 2023
f8020e3
fix
lzhangzz Oct 19, 2023
90f5b8f
dispatch `cp.async`
lzhangzz Oct 20, 2023
0fe3ab9
fix linking
lzhangzz Oct 20, 2023
333ce08
fix
lzhangzz Oct 20, 2023
abaca3e
fix deadlock
lzhangzz Oct 20, 2023
79686aa
Merge remote-tracking branch 'origin/main' into tm2
lzhangzz Oct 20, 2023
290e087
guard input length
lzhangzz Oct 20, 2023
ca70033
correct start offset
lzhangzz Oct 20, 2023
32037fd
fix prefill chunking
lzhangzz Oct 20, 2023
0313866
fix `cache_block_seq_len` param passing
lzhangzz Oct 21, 2023
b70a4f6
fix `block_size` fmtstr
lzhangzz Oct 21, 2023
2290461
fix output tokens
lzhangzz Oct 21, 2023
66fa64b
fix batch resizing
lzhangzz Oct 23, 2023
18001cd
fix masking of finished sequences
lzhangzz Oct 23, 2023
8705131
add debug util
lzhangzz Oct 23, 2023
64de1cd
free unused block early
lzhangzz Oct 25, 2023
699b0bf
add ntk scaling and logn scaling
lzhangzz Oct 30, 2023
2e08a0b
cmake flags
lzhangzz Oct 31, 2023
44782a1
fix typo
lzhangzz Nov 2, 2023
39c1a87
w4a16 for sm75
lzhangzz Nov 2, 2023
c8eedef
fix msvc build
lzhangzz Nov 2, 2023
6de4a37
fix msvc build
lzhangzz Nov 2, 2023
86f60c3
fix block verification
lzhangzz Nov 2, 2023
bce90b3
fix msvc build
lzhangzz Nov 2, 2023
683b1b9
use `std::shuffle`
lzhangzz Nov 2, 2023
5563b26
fix lint
lzhangzz Nov 2, 2023
8936413
fix lint
lzhangzz Nov 2, 2023
bd6b89c
fix lint
lzhangzz Nov 2, 2023
8c8d8bf
clear incoming buffer
lzhangzz Nov 6, 2023
d3a1356
clear finished requests
lzhangzz Nov 7, 2023
55dcb8b
fix batch initialization
lzhangzz Nov 7, 2023
b7bf3d7
fix typo
lzhangzz Nov 7, 2023
efe06ea
Merge remote-tracking branch 'origin/main' into tm2
lzhangzz Nov 8, 2023
6b1c38b
fix typo
lzhangzz Nov 9, 2023
15b4921
fix comparison
lzhangzz Nov 9, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,22 @@ option(SPARSITY_SUPPORT "Build project with Ampere sparsity feature support" OFF

option(BUILD_FAST_MATH "Build in fast math mode" ON)

# the environment variable
# ASAN_OPTIONS=protect_shadow_gap=0,intercept_tls_get_addr=0
# must be set at runtime
# https://github.com/google/sanitizers/issues/1322
if (LMDEPLOY_ASAN_ENABLE)
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:-fsanitize=address>)
add_link_options(-fsanitize=address)
endif ()

# notice that ubsan has linker issues for ubuntu < 18.04, see
# https://stackoverflow.com/questions/50024731/ld-unrecognized-option-push-state-no-as-needed
if (LMDEPLOY_UBSAN_ENABLE)
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:-fsanitize=undefined>)
add_link_options(-fsanitize=undefined)
endif ()

if(BUILD_MULTI_GPU)
message(STATUS "Add DBUILD_MULTI_GPU, requires MPI and NCCL")
add_definitions("-DBUILD_MULTI_GPU")
Expand Down
35 changes: 16 additions & 19 deletions examples/cpp/llama/llama_triton_example.cc
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,9 @@ broadCastRequest(const std::vector<int>& v_start_ids,
if (node_id == 0) {
memcpy(v_input_ids.data(), v_start_ids.data(), size_1 * sizeof(int));
memcpy(v_input_lengths.data(), v_start_lengths.data(), size_2 * sizeof(int));
memcpy(v_input_bad_words.data(), v_bad_words.data(), size_bad_words * sizeof(int));
if (!v_input_bad_words.empty()) {
memcpy(v_input_bad_words.data(), v_bad_words.data(), size_bad_words * sizeof(int));
}
}
if (kUSE_MPI) {
ft::mpi::barrier();
Expand Down Expand Up @@ -431,6 +433,8 @@ int main(int argc, char* argv[])
const int beam_width = output_tensors_lists[0].get()->at("output_ids").shape[1];
const int seq_len = output_tensors_lists[0].get()->at("output_ids").shape[2];

ft::FT_CHECK(beam_width == 1);

std::vector<int> seq_lens(batch_size);
// step 6: check results
if (node_id == 0) {
Expand All @@ -440,32 +444,25 @@ int main(int argc, char* argv[])
printf("[WARNING] Cannot write results into output file %s \n", fName.c_str());
}
else {
size_t outCount = batch_size * beam_width * seq_len;
// int* hBuf = new int[outCount];
const size_t outCount = batch_size * beam_width * seq_len;

std::vector<int> hBuf(outCount);

ft::cudaD2Hcpy(hBuf.data(), d_output_ids, outCount);
ft::cudaD2Hcpy(seq_lens.data(), d_seq_lens, batch_size);

std::cout << "sequence length: ";
for (int i = 0; i < batch_size; ++i) {
std::cout << (i ? ", " : "") << seq_lens[i];
}
std::cout << "\n";
{
std::cout << "Writing " << outCount << " elements\n";
int zeroCount = 0;
for (size_t i = 0; i < outCount; i++) {
if (hBuf[i] == int(0))
zeroCount++;
outFile << hBuf[i] << " ";
if ((i + 1) % (seq_len) == 0)
outFile << std::endl;

if (i < 10)
printf("%5d ", hBuf[i]);
if ((i + 1) % (seq_len) == 0 && i < 10)
std::cout << std::endl;

for (int i = 0; i < batch_size; ++i) {
outFile << (i ? "\n" : "");
auto buf = hBuf.data() + seq_len * i;
for (int j = 0; j < seq_lens[i]; ++j) {
outFile << buf[j] << " ";
}
std::cout << std::endl << "zeroCount = " << zeroCount << std::endl;
}
}
}
Expand All @@ -475,7 +472,7 @@ int main(int argc, char* argv[])
}
cudaDeviceSynchronize();

if (1) {
if (0) {
// test time
auto start = std::chrono::high_resolution_clock::now();

Expand Down
1 change: 1 addition & 0 deletions src/turbomind/kernels/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -71,3 +71,4 @@ set_property(TARGET custom_ar_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET custom_ar_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)

add_subdirectory(gemm_s_f16)
add_subdirectory(decoder_multihead_attention)
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,12 @@

////////////////////////////////////////////////////////////////////////////////////////////////////

// cudaFuncAttributes attr{}; \
// cudaFuncGetAttributes(&attr, func); \
// std::cout << "static_smem_sz: " << attr.sharedSizeBytes << std::endl; \
// std::cout << "max_dynamic_smem: " << attr.maxDynamicSharedSizeBytes << std::endl; \
// std::cout << "dynamic_smem_sz: " << smem_sz << std::endl; \

template<typename T, int Dh, int Dh_MAX, typename KERNEL_PARAMS_TYPE>
void mmha_launch_kernel(const KERNEL_PARAMS_TYPE& params, const cudaStream_t& stream)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,7 @@ namespace mmha {
////////////////////////////////////////////////////////////////////////////////////////////////////

template<typename T, int Dh>
struct Qk_vec_m_ {
};
struct Qk_vec_m_ {};

template<>
struct Qk_vec_m_<float, 32> {
Expand Down Expand Up @@ -180,8 +179,7 @@ struct Qk_vec_k_<__nv_fp8_e4m3, 256> {
////////////////////////////////////////////////////////////////////////////////////////////////////

template<typename T, int THREADS_PER_KEY>
struct K_vec_m_ {
};
struct K_vec_m_ {};

template<>
struct K_vec_m_<float, 4> {
Expand Down Expand Up @@ -262,8 +260,7 @@ struct K_vec_k_<__nv_fp8_e4m3, 1> {
////////////////////////////////////////////////////////////////////////////////////////////////////

template<typename T, int V_VEC_SIZE>
struct V_vec_m_ {
};
struct V_vec_m_ {};

template<>
struct V_vec_m_<float, 1> {
Expand Down Expand Up @@ -343,8 +340,7 @@ struct V_vec_k_<__nv_fp8_e4m3, 16> {

#ifdef MMHA_USE_FP32_ACUM_FOR_FMA
template<typename T>
struct Qk_vec_acum_fp32_ {
};
struct Qk_vec_acum_fp32_ {};

template<>
struct Qk_vec_acum_fp32_<float> {
Expand Down Expand Up @@ -426,8 +422,7 @@ struct Qk_vec_acum_fp32_<fp8_4_t> {
////////////////////////////////////////////////////////////////////////////////////////////////////

template<typename T>
struct K_vec_acum_fp32_ {
};
struct K_vec_acum_fp32_ {};

template<>
struct K_vec_acum_fp32_<float> {
Expand Down Expand Up @@ -489,8 +484,7 @@ struct K_vec_acum_fp32_<fp8_4_t> {

#ifdef MMHA_USE_FP32_ACUM_FOR_OUT
template<typename T>
struct V_vec_acum_fp32_ {
};
struct V_vec_acum_fp32_ {};

template<>
struct V_vec_acum_fp32_<float> {
Expand Down Expand Up @@ -1472,6 +1466,8 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params<T>
}
// We don't need to apply the linear position bias here since qi - ki = 0 yields the position bias 0.

printf("QK_last[%d] = %f\n", hi, qk);

qk_max = qk;
qk_smem[tlength - first_step] = qk;
// qk_smem[params.timestep] = qk;
Expand Down Expand Up @@ -1596,6 +1592,7 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params<T>

qk += mul<float, T, float>(params.linear_bias_slopes[hi], dist);
}
// printf("QK_%d = %f\n", (int)ti, qk);
qk_max = is_mask ? qk_max : fmaxf(qk_max, qk);
qk_smem[ti - first_step] = qk;
}
Expand Down Expand Up @@ -1632,6 +1629,10 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params<T>
// Broadcast to all the threads in the warp.
qk_max = __shfl_sync(uint32_t(-1), qk_max, 0);

if (threadIdx.x == 0) {
printf("QK_MAX[%d] = %f\n", hi, (float)qk_max);
}

// Compute the logits and start the sum.
float sum = 0.f;
// for( int ti = tidx; ti <= params.timestep; ti += THREADS_PER_BLOCK ) {
Expand All @@ -1657,6 +1658,10 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params<T>
// Compute the sum.
sum = block_sum<WARPS_PER_BLOCK>(&red_smem[WARPS_PER_BLOCK], sum);

if (threadIdx.x == 0) {
printf("SUM[%d] = %f\n", hi, (float)sum);
}

// Normalize the logits.
float inv_sum = __fdividef(1.f, sum + 1.e-6f);

Expand Down
Loading
Loading