Skip to content

Commit

Permalink
Optimize for throughput (#701)
Browse files Browse the repository at this point in the history
* tmp

* update

* update

* optimize for throughput

* update

* fix eos

* clean up

* fix serving

* fix indexed copy

* minor

* minor

---------

Co-authored-by: lvhan028 <[email protected]>
  • Loading branch information
lzhangzz and lvhan028 authored Nov 20, 2023
1 parent 65d735b commit 911c0a8
Show file tree
Hide file tree
Showing 24 changed files with 744 additions and 511 deletions.
4 changes: 2 additions & 2 deletions examples/cpp/llama/llama_triton_example.cc
Original file line number Diff line number Diff line change
Expand Up @@ -448,8 +448,8 @@ int main(int argc, char* argv[])

std::vector<int> hBuf(outCount);

ft::cudaD2Hcpy(hBuf.data(), d_output_ids, outCount);
ft::cudaD2Hcpy(seq_lens.data(), d_seq_lens, batch_size);
ft::cudaAutoCpy(hBuf.data(), d_output_ids, outCount);
ft::cudaAutoCpy(seq_lens.data(), d_seq_lens, batch_size);

std::cout << "sequence length: ";
for (int i = 0; i < batch_size; ++i) {
Expand Down
3 changes: 1 addition & 2 deletions lmdeploy/turbomind/turbomind.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,7 +350,7 @@ def _broadcast_np(data, dtype, shape=(batch_size, )):
outputs = _tm_dict_to_torch_dict(tm_outputs)

output_ids = outputs['output_ids'][:, 0, :]
sequence_length = outputs['sequence_length'].long()[:, 0].cpu()
sequence_length = outputs['sequence_length'].long()[:, 0]
output_ids = [
output_id[s:l] for output_id, s, l in zip(
output_ids, seq_start, sequence_length)
Expand All @@ -366,7 +366,6 @@ def _broadcast_np(data, dtype, shape=(batch_size, )):
outputs.append((output[:-1], len_))
else:
outputs.append((output, len_))

yield outputs

if finish:
Expand Down
83 changes: 78 additions & 5 deletions src/turbomind/kernels/gemm_s_f16/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -236,15 +236,88 @@ __inline__ __device__ half2 apply_Q(const half2& x, const half2& q)

template<typename T, int N>
struct Array {
    // Fixed-size POD array usable in both host and device code, modeled on
    // std::array<T, N> (std::array itself is not usable in __device__ code).
    // Aggregate type: supports brace initialization and trivial copy, which is
    // required for passing Array by value into kernels.
    using value_type      = T;
    using size_type       = int;  // int (not size_t): cheaper indexing on device
    using difference_type = int;
    using reference       = value_type&;
    using const_reference = const value_type&;
    using pointer         = value_type*;
    using const_pointer   = const value_type*;
    using iterator        = pointer;
    using const_iterator  = const_pointer;

    // Zero-length arrays are not valid C++; reject them at compile time.
    static_assert(N > 0);

    // Public so the type stays an aggregate; the dunder name discourages
    // direct access in user code.
    T __a[N];

    // Unchecked element access, mirroring std::array::operator[].
    __device__ __host__ constexpr reference operator[](size_type i) noexcept
    {
        return __a[i];
    }
    __device__ __host__ constexpr const_reference operator[](size_type i) const noexcept
    {
        return __a[i];
    }

    // First element; valid because N > 0 is enforced above.
    __device__ __host__ constexpr reference front() noexcept
    {
        return *begin();
    }

    __device__ __host__ constexpr const_reference front() const noexcept
    {
        return *begin();
    }

    // Last element.
    __device__ __host__ constexpr reference back() noexcept
    {
        return *(end() - 1);
    }

    __device__ __host__ constexpr const_reference back() const noexcept
    {
        return *(end() - 1);
    }

    // Pointer to contiguous storage.
    __device__ __host__ constexpr pointer data() noexcept
    {
        return &__a[0];
    }

    __device__ __host__ constexpr const_pointer data() const noexcept
    {
        return &__a[0];
    }

    // Iterators are raw pointers over the contiguous storage.
    __device__ __host__ constexpr iterator begin() noexcept
    {
        return data();
    }

    __device__ __host__ constexpr const_iterator begin() const noexcept
    {
        return data();
    }

    __device__ __host__ constexpr iterator end() noexcept
    {
        return data() + N;
    }

    __device__ __host__ constexpr const_iterator end() const noexcept
    {
        return data() + N;
    }

    // Size is returned as a compile-time integral constant so callers can use
    // arr.size() in constant-expression contexts (e.g. template arguments).
    __device__ __host__ constexpr std::integral_constant<int, N> size() const noexcept
    {
        return {};
    }

    // Never empty (N > 0), encoded in the type as std::false_type.
    __device__ __host__ constexpr std::false_type empty() const noexcept
    {
        return {};
    }
};

Expand Down
4 changes: 3 additions & 1 deletion src/turbomind/layers/DynamicDecodeLayer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,7 @@ void DynamicDecodeLayer<T>::forward(TensorMap* output_tensors, TensorMap* input_
*
* output_tensors:
* \param output_ids [max_seq_len, batch_size]
* \param curand_state [local_batch_size]
* \param finished [batch_size * beam_width], optional
* \param should_stop [1] on cpu
* \param cum_log_probs [batch_size * beam_width], necessary in beam search
Expand Down Expand Up @@ -276,7 +277,8 @@ void DynamicDecodeLayer<T>::forward(TensorMap* output_tensors, TensorMap* input_
{"input_lengths", input_lengths.slice({local_batch_size, beam_width}, local_batch_offset)});
}

TensorMap decode_output_tensors({{"output_ids", output_tensors->at("output_ids")}});
TensorMap decode_output_tensors({{"output_ids", output_tensors->at("output_ids")}, //
{"curand_state", output_tensors->at("curand_state")}});
if (output_tensors->isExist("sequence_length")) {
Tensor sequence_length = output_tensors->at("sequence_length");
decode_output_tensors.insert(
Expand Down
9 changes: 0 additions & 9 deletions src/turbomind/layers/DynamicDecodeLayer.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,15 +53,6 @@ class DynamicDecodeLayer: public BaseLayer {
int* h_pinned_finished_sum_ = nullptr;

public:
curandState_t* topk_curandstate_buf()
{
return static_cast<BaseSamplingLayer<T>*>(topk_decode_)->curandstate_buf();
}
curandState_t* topp_curandstate_buf()
{
return static_cast<BaseSamplingLayer<T>*>(topp_decode_)->curandstate_buf();
}

DynamicDecodeLayer(size_t vocab_size,
size_t vocab_size_padded,
int end_id,
Expand Down
32 changes: 0 additions & 32 deletions src/turbomind/layers/sampling_layers/BaseSamplingLayer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,6 @@ template<typename T>
void BaseSamplingLayer<T>::allocateBuffer(size_t batch_size, Tensor top_k, Tensor top_p)
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
curandstate_buf_ = reinterpret_cast<curandState_t*>(
allocator_->reMalloc(curandstate_buf_, sizeof(curandState_t) * batch_size, false));
random_seeds_buf_ = reinterpret_cast<unsigned long long*>(
allocator_->reMalloc(random_seeds_buf_, sizeof(unsigned long long) * batch_size, false));
temperature_buf_ =
reinterpret_cast<float*>(allocator_->reMalloc(temperature_buf_, sizeof(float) * batch_size, false));
repetition_penalty_buf_ =
Expand All @@ -58,8 +54,6 @@ void BaseSamplingLayer<T>::freeBuffer()
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
if (is_allocate_buffer_) {
allocator_->free((void**)(&curandstate_buf_));
allocator_->free((void**)(&random_seeds_buf_));
allocator_->free((void**)(&temperature_buf_));
allocator_->free((void**)(&repetition_penalty_buf_));
allocator_->free((void**)(&min_lengths_buf_));
Expand Down Expand Up @@ -128,32 +122,6 @@ void BaseSamplingLayer<T>::setup(const size_t batch_size, const size_t beam_widt
Tensor runtime_top_p = runtime_args->isExist("runtime_top_p") ? runtime_args->at("runtime_top_p") : Tensor();
allocateBuffer(batch_size, runtime_top_k, runtime_top_p);

// If runtime argument has single random seed, using this random seed to initialize the random table of all
// sentences. If the argument has [batch_size] random seeds, initializing the random table by different random seeds
// respectively. If no random seed, initialize the random table of all sentences by 0 directly.
if (runtime_args->isExist("random_seed")) {
Tensor random_seeds = runtime_args->at("random_seed");
FT_CHECK_WITH_INFO(random_seeds.shape.size() == 1
&& (random_seeds.size() == 1 || random_seeds.size() == batch_size),
fmtstr("random_seeds must be of shape [1] or [batch_size(%ld)], got random_seeds.shape=%s",
batch_size,
vec2str(random_seeds.shape).c_str()));
if (random_seeds.size() == 1) {
invokeCurandInitialize(curandstate_buf_, batch_size, random_seeds.getVal<unsigned long long>(), stream_);
sync_check_cuda_error();
}
else {
unsigned long long* random_seed_ptr = random_seeds.getPtr<unsigned long long>();
cudaAutoCpy(random_seeds_buf_, random_seed_ptr, batch_size, stream_);
invokeCurandBatchInitialize(curandstate_buf_, batch_size, random_seeds_buf_, stream_);
sync_check_cuda_error();
}
}
else {
// Initialize curand states using the default seed 0.
invokeCurandInitialize(curandstate_buf_, batch_size, 0, stream_);
}

// Setup penalties.
const float default_temperature = 1.0f;
Tensor temperature = runtime_args->isExist("temperature") ?
Expand Down
11 changes: 2 additions & 9 deletions src/turbomind/layers/sampling_layers/BaseSamplingLayer.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,8 @@ class BaseSamplingLayer: public DynamicDecodeBaseLayer {
size_t vocab_size_;
size_t vocab_size_padded_;

size_t sampling_workspace_size_;
void* sampling_workspace_ = nullptr;
curandState_t* curandstate_buf_ = nullptr;
unsigned long long* random_seeds_buf_ = nullptr;
size_t sampling_workspace_size_;
void* sampling_workspace_ = nullptr;

float* temperature_buf_ = nullptr;
float* repetition_penalty_buf_ = nullptr;
Expand All @@ -59,11 +57,6 @@ class BaseSamplingLayer: public DynamicDecodeBaseLayer {
virtual void allocateBuffer(size_t batch_size, Tensor top_k, Tensor top_p);

public:
curandState_t* curandstate_buf()
{
return curandstate_buf_;
}

BaseSamplingLayer(size_t max_batch_size,
size_t vocab_size,
size_t vocab_size_padded,
Expand Down
4 changes: 3 additions & 1 deletion src/turbomind/layers/sampling_layers/TopKSamplingLayer.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
*/

#include <float.h>
#include <sstream>

#include "src/turbomind/kernels/sampling_topk_kernels.h"
#include "src/turbomind/kernels/sampling_topp_kernels.h"
Expand Down Expand Up @@ -199,6 +200,7 @@ void TopKSamplingLayer<T>::runSampling(TensorMap* output_tensors, TensorMap* inp

// output_tensors:
// output_ids [max_seq_len, batch_size]
// curand_state [local_batch_size]
// finished [local_batch_size], optional
// sequence_length [local_batch_size], optional
// cum_log_probs [batch_size], must be float*, optional
Expand Down Expand Up @@ -255,7 +257,7 @@ void TopKSamplingLayer<T>::runSampling(TensorMap* output_tensors, TensorMap* inp
output_tensors->at("finished", Tensor{MEMORY_GPU, TYPE_INVALID, {}, nullptr}).getPtr<bool>(),
cum_log_probs,
output_log_probs,
curandstate_buf_ + ite * local_batch_size,
output_tensors->at("curand_state").getPtr<curandState_t>() + ite * local_batch_size,
(int)runtime_max_top_k_, // useless because runtime_top_k_buf_ is never nullptr. Keep for legacy.
(int*)(runtime_top_k_buf_ + ite * local_batch_size),
1.0f, // useless because runtime_top_p_buf_ is never nullptr. Keep for legacy.
Expand Down
2 changes: 0 additions & 2 deletions src/turbomind/layers/sampling_layers/TopKSamplingLayer.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,6 @@ class TopKSamplingLayer: public BaseSamplingLayer<T> {

using BaseSamplingLayer<T>::sampling_workspace_size_;
using BaseSamplingLayer<T>::sampling_workspace_;
using BaseSamplingLayer<T>::curandstate_buf_;
using BaseSamplingLayer<T>::random_seeds_buf_;
using BaseSamplingLayer<T>::skip_decode_buf_;
using BaseSamplingLayer<T>::skip_decode_;
using BaseSamplingLayer<T>::skip_any_;
Expand Down
5 changes: 3 additions & 2 deletions src/turbomind/layers/sampling_layers/TopPSamplingLayer.cu
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ void TopPSamplingLayer<T>::allocateBuffer(size_t batch_size, Tensor top_k, Tenso
topp_id_vals_buf_,
topp_offset_buf_,
begin_topp_offset_buf_,
curandstate_buf_,
nullptr, // not used when workspace is null
batch_size,
vocab_size_padded_,
nullptr,
Expand Down Expand Up @@ -267,6 +267,7 @@ void TopPSamplingLayer<T>::runSampling(TensorMap* output_tensors, TensorMap* inp
* output_tensors:
* \param output_ids [max_seq_len, batch_size]
* \param curand_state [local_batch_size]
* \param finished [local_batch_size], optional
* \param sequence_length [local_batch_size], optional
* \param cum_log_probs [batch_size], must be float*, optional
Expand Down Expand Up @@ -319,7 +320,7 @@ void TopPSamplingLayer<T>::runSampling(TensorMap* output_tensors, TensorMap* inp
topp_id_vals_buf_,
topp_offset_buf_,
begin_topp_offset_buf_,
curandstate_buf_ + ite * local_batch_size,
output_tensors->at("curand_state").getPtr<curandState_t>() + ite * local_batch_size,
local_batch_size,
vocab_size_padded_,
input_tensors->at("end_id").getPtr<int>(),
Expand Down
2 changes: 0 additions & 2 deletions src/turbomind/layers/sampling_layers/TopPSamplingLayer.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,6 @@ class TopPSamplingLayer: public BaseSamplingLayer<T> {

using BaseSamplingLayer<T>::sampling_workspace_size_;
using BaseSamplingLayer<T>::sampling_workspace_;
using BaseSamplingLayer<T>::curandstate_buf_;
using BaseSamplingLayer<T>::random_seeds_buf_;
using BaseSamplingLayer<T>::skip_decode_buf_;
using BaseSamplingLayer<T>::skip_decode_;
using BaseSamplingLayer<T>::skip_any_;
Expand Down
Loading

0 comments on commit 911c0a8

Please sign in to comment.