Skip to content

Commit

Permalink
reduce gil switching
Browse files Browse the repository at this point in the history
  • Loading branch information
irexyc committed Sep 12, 2023
1 parent ce21a31 commit fcd70ab
Show file tree
Hide file tree
Showing 6 changed files with 68 additions and 3 deletions.
1 change: 1 addition & 0 deletions src/turbomind/models/llama/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ target_link_libraries(Llama PUBLIC CUDA::cudart
nccl_utils
cuda_utils
logger
pycb_utils
llama_fmha)

if (NOT MSVC)
Expand Down
7 changes: 5 additions & 2 deletions src/turbomind/models/llama/LlamaBatch.cc
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include "src/turbomind/models/llama/llama_utils.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/logger.h"
#include "src/turbomind/utils/pycb_utils.h"
#include <cstdint>
#include <iomanip>
#include <sstream>
Expand Down Expand Up @@ -899,8 +900,9 @@ void LlamaBatch<T>::outputContextLogits(T* context_decoder_

if (context_logits_buf_ == nullptr) {
NcclGuard guard(llama_->tensor_para_, stream_, true);
context_logits_buf_ = (float*)allocator_->malloc(sizeof(float) * llama_->vocab_size_padded_ * max_context_token_num_);
const auto tp = llama_->tensor_para_.world_size_;
context_logits_buf_ =
(float*)allocator_->malloc(sizeof(float) * llama_->vocab_size_padded_ * max_context_token_num_);
const auto tp = llama_->tensor_para_.world_size_;
if (tp > 1) {
FT_CHECK(llama_->vocab_size_padded_ % tp == 0);
const auto local_vocab_size = llama_->vocab_size_padded_ / tp;
Expand Down Expand Up @@ -941,6 +943,7 @@ void LlamaBatch<T>::finish()
for (int i = 0; i < batch_size_; ++i) {
FT_CHECK(requests_[i] != nullptr);
if (requests_[i]->stream_cb && rank_ == 0) {
set_batch_info(i, batch_size_);
requests_[i]->stream_cb(&requests_[i]->outputs[rank_].get());
}
}
Expand Down
14 changes: 13 additions & 1 deletion src/turbomind/python/bind.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include "src/turbomind/triton_backend/transformer_triton_backend.hpp"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/nccl_utils.h"
#include "src/turbomind/utils/pycb_utils.h"
#include <cuda_runtime.h>
#include <memory>
#include <pybind11/functional.h>
Expand Down Expand Up @@ -329,7 +330,18 @@ PYBIND11_MODULE(_turbomind, m)
.def(
"register_callback",
[](AbstractTransformerModelInstance* self, triton_stream_cb_t cb, py::object ctx) {
self->registerCallback(cb, ctx.ptr());
auto callback = [=](std::shared_ptr<std::unordered_map<std::string, triton::Tensor>> outputs,
void* ctx) {
thread_local PyGILState_STATE gstate;
if (ft::is_first_in_batch()) {
gstate = PyGILState_Ensure();
}
cb(outputs, ctx);
if (ft::is_last_in_batch()) {
PyGILState_Release(gstate);
}
};
self->registerCallback(callback, ctx.ptr());
},
"callback"_a,
"context"_a = nullptr)
Expand Down
3 changes: 3 additions & 0 deletions src/turbomind/utils/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -109,3 +109,6 @@ add_library(tensor STATIC Tensor.cc)
set_property(TARGET tensor PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET tensor PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(tensor PUBLIC cuda_utils logger)

add_library(pycb_utils STATIC pycb_utils.cc)
set_property(TARGET pycb_utils PROPERTY POSITION_INDEPENDENT_CODE ON)
31 changes: 31 additions & 0 deletions src/turbomind/utils/pycb_utils.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
// Copyright (c) OpenMMLab. All rights reserved.

#include "pycb_utils.h"
#include <memory>

namespace turbomind {

// Per-thread position of the request currently being streamed within its
// batch. Plain zero-initialized thread_local ints replace the original
// thread_local shared_ptr<int> pair: the heap indirection bought nothing on
// top of thread_local storage, and dereferencing the (null) pointers before
// set_batch_info() was called was undefined behavior.
thread_local int _current = 0;
thread_local int _total   = 0;

// Record, for the calling thread, which request (`current`, 0-based) out of
// `total` requests is about to have its stream callback invoked.
// Must be called on the same thread that later queries
// is_first_in_batch() / is_last_in_batch(), since the state is thread_local.
void set_batch_info(int current, int total)
{
    _current = current;
    _total   = total;
}

// Non-zero when the current request is the first in its batch — the Python
// binding uses this to acquire the GIL once per batch instead of once per
// callback (see bind.cpp register_callback).
int is_first_in_batch()
{
    return _current == 0;
}

// Non-zero when the current request is the last in its batch — the Python
// binding releases the GIL after this callback.
// With no batch info recorded yet (_total == 0) this is false, never UB.
int is_last_in_batch()
{
    return _current == (_total - 1);
}

}  // namespace turbomind
15 changes: 15 additions & 0 deletions src/turbomind/utils/pycb_utils.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
// Copyright (c) OpenMMLab. All rights reserved.

#pragma once

#include <cstdio>

namespace turbomind {

// Thread-local bookkeeping used by the Python stream-callback wrapper to
// acquire/release the Python GIL once per batch instead of once per request
// (see register_callback in bind.cpp and LlamaBatch::finish()).

/// Record, for the calling thread, the position of the request whose stream
/// callback is about to run.
/// @param current 0-based index of the request within the batch.
/// @param total   number of requests in the batch.
void set_batch_info(int current, int total);

/// @return non-zero if the most recently recorded request is the first of its
///         batch (the wrapper then calls PyGILState_Ensure).
int is_first_in_batch();

/// @return non-zero if the most recently recorded request is the last of its
///         batch (the wrapper then calls PyGILState_Release).
int is_last_in_batch();

}  // namespace turbomind

0 comments on commit fcd70ab

Please sign in to comment.