Skip to content

Commit

Permalink
Optimize for throughput (#701)
Browse files Browse the repository at this point in the history
* tmp

* update

* update

* optimize for throughput

* update

* fix eos

* clean up

* fix serving

* fix indexed copy

* minor

* minor

---------

Co-authored-by: lvhan028 <[email protected]>
  • Loading branch information
lzhangzz and lvhan028 authored Nov 20, 2023
1 parent 65d735b commit 911c0a8
Show file tree
Hide file tree
Showing 24 changed files with 744 additions and 511 deletions.
4 changes: 2 additions & 2 deletions examples/cpp/llama/llama_triton_example.cc
Original file line number Diff line number Diff line change
Expand Up @@ -448,8 +448,8 @@ int main(int argc, char* argv[])

std::vector<int> hBuf(outCount);

ft::cudaD2Hcpy(hBuf.data(), d_output_ids, outCount);
ft::cudaD2Hcpy(seq_lens.data(), d_seq_lens, batch_size);
ft::cudaAutoCpy(hBuf.data(), d_output_ids, outCount);
ft::cudaAutoCpy(seq_lens.data(), d_seq_lens, batch_size);

std::cout << "sequence length: ";
for (int i = 0; i < batch_size; ++i) {
Expand Down
3 changes: 1 addition & 2 deletions lmdeploy/turbomind/turbomind.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,7 +350,7 @@ def _broadcast_np(data, dtype, shape=(batch_size, )):
outputs = _tm_dict_to_torch_dict(tm_outputs)

output_ids = outputs['output_ids'][:, 0, :]
sequence_length = outputs['sequence_length'].long()[:, 0].cpu()
sequence_length = outputs['sequence_length'].long()[:, 0]
output_ids = [
output_id[s:l] for output_id, s, l in zip(
output_ids, seq_start, sequence_length)
Expand All @@ -366,7 +366,6 @@ def _broadcast_np(data, dtype, shape=(batch_size, )):
outputs.append((output[:-1], len_))
else:
outputs.append((output, len_))

yield outputs

if finish:
Expand Down
83 changes: 78 additions & 5 deletions src/turbomind/kernels/gemm_s_f16/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -236,15 +236,88 @@ __inline__ __device__ half2 apply_Q(const half2& x, const half2& q)

template<typename T, int N>
struct Array {
    // Fixed-size POD array usable in both host and device code, modeled on
    // std::array<T, N> (std::array itself is not usable in __device__ code).
    // Aggregate type: supports brace initialization and trivial copy, which is
    // required for passing Array by value into kernels.
    using value_type      = T;
    using size_type       = int;  // int (not size_t): cheaper indexing on device
    using difference_type = int;
    using reference       = value_type&;
    using const_reference = const value_type&;
    using pointer         = value_type*;
    using const_pointer   = const value_type*;
    using iterator        = pointer;
    using const_iterator  = const_pointer;

    // Zero-length arrays are not valid C++; reject them at compile time.
    static_assert(N > 0);

    // Public so the type stays an aggregate; the dunder name discourages
    // direct access in user code.
    T __a[N];

    // Unchecked element access, mirroring std::array::operator[].
    __device__ __host__ constexpr reference operator[](size_type i) noexcept
    {
        return __a[i];
    }
    __device__ __host__ constexpr const_reference operator[](size_type i) const noexcept
    {
        return __a[i];
    }

    // First element; valid because N > 0 is enforced above.
    __device__ __host__ constexpr reference front() noexcept
    {
        return *begin();
    }

    __device__ __host__ constexpr const_reference front() const noexcept
    {
        return *begin();
    }

    // Last element.
    __device__ __host__ constexpr reference back() noexcept
    {
        return *(end() - 1);
    }

    __device__ __host__ constexpr const_reference back() const noexcept
    {
        return *(end() - 1);
    }

    // Pointer to contiguous storage.
    __device__ __host__ constexpr pointer data() noexcept
    {
        return &__a[0];
    }

    __device__ __host__ constexpr const_pointer data() const noexcept
    {
        return &__a[0];
    }

    // Iterators are raw pointers over the contiguous storage.
    __device__ __host__ constexpr iterator begin() noexcept
    {
        return data();
    }

    __device__ __host__ constexpr const_iterator begin() const noexcept
    {
        return data();
    }

    __device__ __host__ constexpr iterator end() noexcept
    {
        return data() + N;
    }

    __device__ __host__ constexpr const_iterator end() const noexcept
    {
        return data() + N;
    }

    // Size is returned as a compile-time integral constant so callers can use
    // arr.size() in constant-expression contexts (e.g. template arguments).
    __device__ __host__ constexpr std::integral_constant<int, N> size() const noexcept
    {
        return {};
    }

    // Never empty (N > 0), encoded in the type as std::false_type.
    __device__ __host__ constexpr std::false_type empty() const noexcept
    {
        return {};
    }
};

Expand Down
4 changes: 3 additions & 1 deletion src/turbomind/layers/DynamicDecodeLayer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,7 @@ void DynamicDecodeLayer<T>::forward(TensorMap* output_tensors, TensorMap* input_
*
* output_tensors:
* \param output_ids [max_seq_len, batch_size]
* \param curand_state [local_batch_size]
* \param finished [batch_size * beam_width], optional
* \param should_stop [1] on cpu
* \param cum_log_probs [batch_size * beam_width], necessary in beam search
Expand Down Expand Up @@ -276,7 +277,8 @@ void DynamicDecodeLayer<T>::forward(TensorMap* output_tensors, TensorMap* input_
{"input_lengths", input_lengths.slice({local_batch_size, beam_width}, local_batch_offset)});
}

TensorMap decode_output_tensors({{"output_ids", output_tensors->at("output_ids")}});
TensorMap decode_output_tensors({{"output_ids", output_tensors->at("output_ids")}, //
{"curand_state", output_tensors->at("curand_state")}});
if (output_tensors->isExist("sequence_length")) {
Tensor sequence_length = output_tensors->at("sequence_length");
decode_output_tensors.insert(
Expand Down
9 changes: 0 additions & 9 deletions src/turbomind/layers/DynamicDecodeLayer.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,15 +53,6 @@ class DynamicDecodeLayer: public BaseLayer {
int* h_pinned_finished_sum_ = nullptr;

public:
curandState_t* topk_curandstate_buf()
{
return static_cast<BaseSamplingLayer<T>*>(topk_decode_)->curandstate_buf();
}
curandState_t* topp_curandstate_buf()
{
return static_cast<BaseSamplingLayer<T>*>(topp_decode_)->curandstate_buf();
}

DynamicDecodeLayer(size_t vocab_size,
size_t vocab_size_padded,
int end_id,
Expand Down
32 changes: 0 additions & 32 deletions src/turbomind/layers/sampling_layers/BaseSamplingLayer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,6 @@ template<typename T>
void BaseSamplingLayer<T>::allocateBuffer(size_t batch_size, Tensor top_k, Tensor top_p)
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
curandstate_buf_ = reinterpret_cast<curandState_t*>(
allocator_->reMalloc(curandstate_buf_, sizeof(curandState_t) * batch_size, false));
random_seeds_buf_ = reinterpret_cast<unsigned long long*>(
allocator_->reMalloc(random_seeds_buf_, sizeof(unsigned long long) * batch_size, false));
temperature_buf_ =
reinterpret_cast<float*>(allocator_->reMalloc(temperature_buf_, sizeof(float) * batch_size, false));
repetition_penalty_buf_ =
Expand All @@ -58,8 +54,6 @@ void BaseSamplingLayer<T>::freeBuffer()
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
if (is_allocate_buffer_) {
allocator_->free((void**)(&curandstate_buf_));
allocator_->free((void**)(&random_seeds_buf_));
allocator_->free((void**)(&temperature_buf_));
allocator_->free((void**)(&repetition_penalty_buf_));
allocator_->free((void**)(&min_lengths_buf_));
Expand Down Expand Up @@ -128,32 +122,6 @@ void BaseSamplingLayer<T>::setup(const size_t batch_size, const size_t beam_widt
Tensor runtime_top_p = runtime_args->isExist("runtime_top_p") ? runtime_args->at("runtime_top_p") : Tensor();
allocateBuffer(batch_size, runtime_top_k, runtime_top_p);

// If runtime argument has single random seed, using this random seed to initialize the random table of all
// sentences. If the argument has [batch_size] random seeds, initializing the random table by different random seeds
// respectively. If no random seed, initialize the random table of all sentences by 0 directly.
if (runtime_args->isExist("random_seed")) {
Tensor random_seeds = runtime_args->at("random_seed");
FT_CHECK_WITH_INFO(random_seeds.shape.size() == 1
&& (random_seeds.size() == 1 || random_seeds.size() == batch_size),
fmtstr("random_seeds must be of shape [1] or [batch_size(%ld)], got random_seeds.shape=%s",
batch_size,
vec2str(random_seeds.shape).c_str()));
if (random_seeds.size() == 1) {
invokeCurandInitialize(curandstate_buf_, batch_size, random_seeds.getVal<unsigned long long>(), stream_);
sync_check_cuda_error();
}
else {
unsigned long long* random_seed_ptr = random_seeds.getPtr<unsigned long long>();
cudaAutoCpy(random_seeds_buf_, random_seed_ptr, batch_size, stream_);
invokeCurandBatchInitialize(curandstate_buf_, batch_size, random_seeds_buf_, stream_);
sync_check_cuda_error();
}
}
else {
// Initialize curand states using the default seed 0.
invokeCurandInitialize(curandstate_buf_, batch_size, 0, stream_);
}

// Setup penalties.
const float default_temperature = 1.0f;
Tensor temperature = runtime_args->isExist("temperature") ?
Expand Down
11 changes: 2 additions & 9 deletions src/turbomind/layers/sampling_layers/BaseSamplingLayer.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,8 @@ class BaseSamplingLayer: public DynamicDecodeBaseLayer {
size_t vocab_size_;
size_t vocab_size_padded_;

size_t sampling_workspace_size_;
void* sampling_workspace_ = nullptr;
curandState_t* curandstate_buf_ = nullptr;
unsigned long long* random_seeds_buf_ = nullptr;
size_t sampling_workspace_size_;
void* sampling_workspace_ = nullptr;

float* temperature_buf_ = nullptr;
float* repetition_penalty_buf_ = nullptr;
Expand All @@ -59,11 +57,6 @@ class BaseSamplingLayer: public DynamicDecodeBaseLayer {
virtual void allocateBuffer(size_t batch_size, Tensor top_k, Tensor top_p);

public:
curandState_t* curandstate_buf()
{
return curandstate_buf_;
}

BaseSamplingLayer(size_t max_batch_size,
size_t vocab_size,
size_t vocab_size_padded,
Expand Down
4 changes: 3 additions & 1 deletion src/turbomind/layers/sampling_layers/TopKSamplingLayer.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
*/

#include <float.h>
#include <sstream>

#include "src/turbomind/kernels/sampling_topk_kernels.h"
#include "src/turbomind/kernels/sampling_topp_kernels.h"
Expand Down Expand Up @@ -199,6 +200,7 @@ void TopKSamplingLayer<T>::runSampling(TensorMap* output_tensors, TensorMap* inp

// output_tensors:
// output_ids [max_seq_len, batch_size]
// curand_state [local_batch_size]
// finished [local_batch_size], optional
// sequence_length [local_batch_size], optional
// cum_log_probs [batch_size], must be float*, optional
Expand Down Expand Up @@ -255,7 +257,7 @@ void TopKSamplingLayer<T>::runSampling(TensorMap* output_tensors, TensorMap* inp
output_tensors->at("finished", Tensor{MEMORY_GPU, TYPE_INVALID, {}, nullptr}).getPtr<bool>(),
cum_log_probs,
output_log_probs,
curandstate_buf_ + ite * local_batch_size,
output_tensors->at("curand_state").getPtr<curandState_t>() + ite * local_batch_size,
(int)runtime_max_top_k_, // useless because runtime_top_k_buf_ is never nullptr. Keep for legacy.
(int*)(runtime_top_k_buf_ + ite * local_batch_size),
1.0f, // useless because runtime_top_p_buf_ is never nullptr. Keep for legacy.
Expand Down
2 changes: 0 additions & 2 deletions src/turbomind/layers/sampling_layers/TopKSamplingLayer.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,6 @@ class TopKSamplingLayer: public BaseSamplingLayer<T> {

using BaseSamplingLayer<T>::sampling_workspace_size_;
using BaseSamplingLayer<T>::sampling_workspace_;
using BaseSamplingLayer<T>::curandstate_buf_;
using BaseSamplingLayer<T>::random_seeds_buf_;
using BaseSamplingLayer<T>::skip_decode_buf_;
using BaseSamplingLayer<T>::skip_decode_;
using BaseSamplingLayer<T>::skip_any_;
Expand Down
5 changes: 3 additions & 2 deletions src/turbomind/layers/sampling_layers/TopPSamplingLayer.cu
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ void TopPSamplingLayer<T>::allocateBuffer(size_t batch_size, Tensor top_k, Tenso
topp_id_vals_buf_,
topp_offset_buf_,
begin_topp_offset_buf_,
curandstate_buf_,
nullptr, // not used when workspace is null
batch_size,
vocab_size_padded_,
nullptr,
Expand Down Expand Up @@ -267,6 +267,7 @@ void TopPSamplingLayer<T>::runSampling(TensorMap* output_tensors, TensorMap* inp
* output_tensors:
* \param output_ids [max_seq_len, batch_size]
* \param curand_state [local_batch_size]
* \param finished [local_batch_size], optional
* \param sequence_length [local_batch_size], optional
* \param cum_log_probs [batch_size], must be float*, optional
Expand Down Expand Up @@ -319,7 +320,7 @@ void TopPSamplingLayer<T>::runSampling(TensorMap* output_tensors, TensorMap* inp
topp_id_vals_buf_,
topp_offset_buf_,
begin_topp_offset_buf_,
curandstate_buf_ + ite * local_batch_size,
output_tensors->at("curand_state").getPtr<curandState_t>() + ite * local_batch_size,
local_batch_size,
vocab_size_padded_,
input_tensors->at("end_id").getPtr<int>(),
Expand Down
2 changes: 0 additions & 2 deletions src/turbomind/layers/sampling_layers/TopPSamplingLayer.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,6 @@ class TopPSamplingLayer: public BaseSamplingLayer<T> {

using BaseSamplingLayer<T>::sampling_workspace_size_;
using BaseSamplingLayer<T>::sampling_workspace_;
using BaseSamplingLayer<T>::curandstate_buf_;
using BaseSamplingLayer<T>::random_seeds_buf_;
using BaseSamplingLayer<T>::skip_decode_buf_;
using BaseSamplingLayer<T>::skip_decode_;
using BaseSamplingLayer<T>::skip_any_;
Expand Down
Loading

0 comments on commit 911c0a8

Please sign in to comment.