[Bug] CUDA runtime error #475
Comments
Can you share the code?
Below is what I put together following it. With this code, though, I've rerun it many times and can no longer reproduce the error, so I'm not sure whether the problem is in the interface I originally wrapped... However, with this code, when tensor_para_size_=2 it very often hangs partway through execution. Could you help check whether something in the code is written incorrectly?
#include "3rdparty/INIReader.h"
#include "tokenizers_cpp.h"
#include <algorithm>
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <memory>
#include <mutex>
#include <string>
#include <thread>
#include <unordered_map>
#include <vector>
#include "src/turbomind/macro.h"
#include "src/turbomind/triton_backend/llama/LlamaTritonModel.h"
#include "src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h"
#include "src/turbomind/triton_backend/transformer_triton_backend.hpp"
#include "src/turbomind/utils/custom_ar_comm.h"
#include "src/turbomind/utils/mpi_utils.h"
#include "src/turbomind/utils/nccl_utils.h"
#include "src/turbomind/utils/nvtx_utils.h"
#include "src/turbomind/utils/word_list.h"
namespace ft = turbomind;
std::unique_ptr<tokenizers::Tokenizer> g_tokenizer = nullptr;
std::string load_bytes_from_file(const std::string& path) {
std::ifstream fs(path, std::ios::in | std::ios::binary);
if (fs.fail()) {
std::cerr << "Cannot open " << path << std::endl;
exit(20);
}
std::string data;
fs.seekg(0, std::ios::end);
size_t size = static_cast<size_t>(fs.tellg());
fs.seekg(0, std::ios::beg);
data.resize(size);
fs.read((char*)data.data(), size);
return data;
}
void init_tokenizer(const std::string& inifile) {
INIReader reader = INIReader(inifile);
if (reader.ParseError() < 0) {
std::cout << "[ERROR] Can't load '" << inifile << "'\n";
exit(10);
}
std::string tokenizerFile = reader.Get("ft_instance_hyperparameter", "tokenizer_path");
auto blob = load_bytes_from_file(tokenizerFile);
// Note: all the current factory APIs take an in-memory blob as input.
// This gives some flexibility on how these blobs can be read.
g_tokenizer = tokenizers::Tokenizer::FromBlobSentencePiece(blob);
}
constexpr const bool kUSE_MPI = true;
int g_node_id = 0;
int g_node_num = 1;
int g_gpu_count = 1;
int g_world_size = 1;
int g_tensor_para_size = 1;
int g_pipeline_para_size = 1;
std::shared_ptr<AbstractTransformerModel> g_model = nullptr;
std::pair<std::vector<ft::NcclParam>, std::vector<ft::NcclParam>> g_nccl_comms;
std::vector<std::unique_ptr<AbstractTransformerModelInstance>> g_single_model_instances;
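// Startup worker for one GPU: binds the thread to device_id, creates the shared weights for this rank, and builds a long-lived model instance on its own stream.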
int threadCreateSingleModelInstances(const int device_id, const int rank) {
printf("[INFO] rank = %d \n", rank);
ft::check_cuda_error(cudaSetDevice(device_id));
cudaStream_t stream;
ft::check_cuda_error(cudaStreamCreate(&stream));
g_model->createSharedWeights(device_id, rank);
auto model_instance = g_model->createModelInstance(device_id, rank, stream, g_nccl_comms, nullptr);
g_single_model_instances.at(device_id) = std::move(model_instance);
printf("model instance %d is created \n", device_id);
ft::print_mem_usage();
return 0;
}
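// One-time initialization: optionally brings up MPI, creates the Llama model from the ini file, sets up NCCL communicators, and spawns one thread per GPU to build the long-lived instances above.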
void init_model(const std::string& inifile, int argc = 0, char** argv = nullptr) {
if (argc == 0) {
const char* custom_args[] = {"lmdeploy_test"};
argc = 1;
argv = const_cast<char**>(custom_args);
}
if (kUSE_MPI) {
ft::mpi::initialize(&argc, &argv);
g_node_id = ft::mpi::getCommWorldRank();
g_node_num = ft::mpi::getCommWorldSize();
}
printf("node_id=%d node_num=%d\n", g_node_id, g_node_num);
// Note: assumes every node has the same GPU count
g_gpu_count = ft::getDeviceCount();
g_world_size = g_node_num * g_gpu_count;
// step 1: Create model
g_model = AbstractTransformerModel::createLlamaModel(inifile);
g_tensor_para_size = g_model->getTensorParaSize();
g_pipeline_para_size = g_model->getPipelineParaSize();
printf(
"world_size=%d tensor_para_size=%d pipeline_para_size=%d\n", g_world_size, g_tensor_para_size, g_pipeline_para_size);
FT_CHECK_WITH_INFO(g_world_size == (g_tensor_para_size * g_pipeline_para_size),
"World Size != Tensor Parallel Size * Pipeline Parallel Size !");
std::cout << g_model->toString();
// step 2: Initialize the NCCL
g_nccl_comms = g_model->createNcclParams(g_node_id);
cudaDeviceSynchronize();
// step 3: Create model instances
g_single_model_instances.resize((size_t)g_gpu_count);  // not used for inference directly; keeps the shared weights alive
std::vector<std::thread> threads;
for (int device_id = 0; device_id < g_gpu_count; device_id++) {
const int rank = g_node_id * g_gpu_count + device_id;
threads.push_back(std::thread(threadCreateSingleModelInstances, device_id, rank));
}
for (auto& t : threads) {
t.join();
}
}
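// Reads thread_num / repeat_num from the [loop] section of the ini file.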
void get_loop_config(const std::string& inifile, int& thread_num, int& repeat_num) {
INIReader reader = INIReader(inifile);
if (reader.ParseError() < 0) {
std::cout << "[ERROR] Can't load '" << inifile << "'\n";
exit(10);
}
thread_num = reader.GetInteger("loop", "thread_num", 1);
repeat_num = reader.GetInteger("loop", "repeat_num", 1);
}
struct RequestParam {
int beam_width;
int max_input_len;
int request_output_len;
float beam_search_diversity_rate;
uint runtime_top_k;
float runtime_top_p;
float temperature;
float len_penalty;
float repetition_penalty;
float presence_penalty;
int min_length;
unsigned long long int random_seed;
int start_id;
int end_id;
unsigned long long request_id;
};
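// Sampling and decoding parameters read from the [request] section of the ini file.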
void get_request_param(const std::string& inifile, RequestParam& param) {
INIReader reader = INIReader(inifile);
if (reader.ParseError() < 0) {
std::cout << "[ERROR] Can't load '" << inifile << "'\n";
exit(10);
}
param.beam_width = reader.GetInteger("request", "beam_width");
param.max_input_len = reader.GetInteger("request", "max_input_len");
param.request_output_len = reader.GetInteger("request", "request_output_len");
param.beam_search_diversity_rate = reader.GetFloat("request", "beam_search_diversity_rate");
param.runtime_top_k = reader.GetInteger("request", "top_k");
param.runtime_top_p = reader.GetFloat("request", "top_p");
param.temperature = reader.GetFloat("request", "temperature");
param.len_penalty = reader.GetFloat("request", "len_penalty");
param.repetition_penalty = reader.GetFloat("request", "repetition_penalty");
param.presence_penalty = reader.GetFloat("request", "presence_penalty");
param.min_length = reader.GetInteger("request", "min_length");
param.random_seed = (unsigned long long int)0;
param.start_id = reader.GetInteger("request", "start_id");
param.end_id = reader.GetInteger("request", "end_id");
std::cout << "beam_width:" << param.beam_width << std::endl
<< "max_input_len:" << param.max_input_len << std::endl
<< "request_output_len:" << param.request_output_len << std::endl
<< "beam_search_diversity_rate:" << param.beam_search_diversity_rate << std::endl
<< "runtime_top_k:" << param.runtime_top_k << std::endl
<< "runtime_top_p:" << param.runtime_top_p << std::endl
<< "temperature:" << param.temperature << std::endl
<< "len_penalty:" << param.len_penalty << std::endl
<< "repetition_penalty:" << param.repetition_penalty << std::endl
<< "presence_penalty:" << param.presence_penalty << std::endl
<< "min_length:" << param.min_length << std::endl
<< "start_id:" << param.start_id << std::endl
<< "end_id:" << param.end_id << std::endl;
}
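// Builds one fresh model instance per GPU for a single round of inference; the instances reuse the shared weights created during init_model().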
void create_model_instances(std::vector<std::unique_ptr<AbstractTransformerModelInstance>>& model_instances) {
std::vector<std::thread> threads;
model_instances.resize((size_t)g_gpu_count);
threads.clear();
for (int device_id = 0; device_id < g_gpu_count; device_id++) {
const int rank = g_node_id * g_gpu_count + device_id;
threads.emplace_back([device_id, rank, &model_instances]() {
cudaStream_t stream;
ft::check_cuda_error(cudaStreamCreate(&stream));
auto model_instance = g_model->createModelInstance(device_id, rank, stream, g_nccl_comms, nullptr);
model_instances.at(device_id) = std::move(model_instance);
//printf("model instance %d is created \n", device_id);
//ft::print_mem_usage();
});
}
for (auto& t : threads) {
t.join();
}
}
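// Broadcasts the tokenized request to every MPI rank and assembles one input tensor map per local GPU. All host/device allocations are recorded in h_pointer_record / d_pointer_record so free_pointer() can release them after the forward pass.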
std::vector<std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>>
broadCastRequest(const std::vector<int>& v_start_ids,
const std::vector<int>& v_start_lengths,
const std::vector<int>& v_bad_words,
const RequestParam param,
std::vector<void*>* h_pointer_record,
std::vector<void*>* d_pointer_record)
{
// broadcast the request to all nodes, and make one copy of it per local GPU
int size_1 = v_start_ids.size();
int size_2 = v_start_lengths.size();
int size_bad_words = v_bad_words.size();
if (kUSE_MPI) {
ft::mpi::bcast(&size_1, 1, ft::mpi::MPI_TYPE_INT, 0, ft::mpi::COMM_WORLD);
ft::mpi::bcast(&size_2, 1, ft::mpi::MPI_TYPE_INT, 0, ft::mpi::COMM_WORLD);
ft::mpi::bcast(&size_bad_words, 1, ft::mpi::MPI_TYPE_INT, 0, ft::mpi::COMM_WORLD);
}
std::vector<int> v_input_ids(size_1);
std::vector<int> v_input_lengths(size_2);
std::vector<int> v_input_bad_words(size_bad_words);
if (g_node_id == 0) {
memcpy(v_input_ids.data(), v_start_ids.data(), size_1 * sizeof(int));
memcpy(v_input_lengths.data(), v_start_lengths.data(), size_2 * sizeof(int));
memcpy(v_input_bad_words.data(), v_bad_words.data(), size_bad_words * sizeof(int));
}
if (kUSE_MPI) {
ft::mpi::barrier();
}
int request_batch_size = size_2;
int max_input_len = size_1 / size_2;
//std::cerr << "request_batch_size=" << request_batch_size << " max_input_len=" << max_input_len << "\n";
if (kUSE_MPI) {
ft::mpi::bcast(v_input_ids.data(), size_1, ft::mpi::MPI_TYPE_INT, 0, ft::mpi::COMM_WORLD);
ft::mpi::bcast(v_input_lengths.data(), size_2, ft::mpi::MPI_TYPE_INT, 0, ft::mpi::COMM_WORLD);
ft::mpi::bcast(v_input_bad_words.data(), size_bad_words, ft::mpi::MPI_TYPE_INT, 0, ft::mpi::COMM_WORLD);
}
std::vector<std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>> request_list;
for (int device_id = 0; device_id < g_gpu_count; device_id++) {
ft::check_cuda_error(cudaSetDevice(device_id));
int* d_input_ids;
// int* d_input_lengths;
int* d_input_bad_words;
if (max_input_len == 0) {
// unconditional case, no input ids, so do nothing.
d_input_ids = nullptr;
// d_input_lengths = nullptr;
max_input_len = 0;
}
else {
// conditional case.
ft::deviceMalloc(&d_input_ids, size_1, false);
// ft::deviceMalloc(&d_input_lengths, size_2, false);
ft::cudaH2Dcpy(d_input_ids, v_input_ids.data(), size_1);
// ft::cudaH2Dcpy(d_input_lengths, v_input_lengths.data(), size_2);
}
if (!v_input_bad_words.empty()) {
ft::deviceMalloc(&d_input_bad_words, size_bad_words, false);
ft::cudaH2Dcpy(d_input_bad_words, v_input_bad_words.data(), size_bad_words);
}
else {
d_input_bad_words = nullptr;
}
uint32_t* request_output_len_ptr = (uint32_t*)malloc(request_batch_size * sizeof(uint32_t));
int* input_lengths_ptr = (int*)malloc(request_batch_size * sizeof(int));
for (int i = 0; i < request_batch_size; i++) {
request_output_len_ptr[i] = param.request_output_len;
input_lengths_ptr[i] = v_input_lengths[i];
}
int* start_ids_ptr = (int*)malloc(request_batch_size * sizeof(int));
int* end_ids_ptr = (int*)malloc(request_batch_size * sizeof(int));
unsigned long long* req_ids_ptr = (unsigned long long*)malloc(request_batch_size * sizeof(unsigned long long));
for (int i = 0; i < request_batch_size; i++) {
start_ids_ptr[i] = param.start_id;
end_ids_ptr[i] = param.end_id;
req_ids_ptr[i] = param.request_id;
}
h_pointer_record->push_back(start_ids_ptr);
h_pointer_record->push_back(end_ids_ptr);
h_pointer_record->push_back(req_ids_ptr);
request_list.push_back(std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>(
new std::unordered_map<std::string, triton::Tensor>{
{"input_ids",
triton::Tensor{triton::MEMORY_GPU,
triton::TYPE_INT32,
std::vector<size_t>{(size_t)request_batch_size, (size_t)max_input_len},
d_input_ids}},
{"input_lengths",
triton::Tensor{triton::MEMORY_CPU,
triton::TYPE_INT32,
std::vector<size_t>{(size_t)request_batch_size},
input_lengths_ptr}},
{"request_output_len",
triton::Tensor{triton::MEMORY_CPU,
triton::TYPE_INT32,
std::vector<size_t>{(size_t)request_batch_size},
request_output_len_ptr}},
{"bad_words_list",
triton::Tensor{
triton::MEMORY_GPU, triton::TYPE_INT32, {2, v_input_bad_words.size() / 2}, d_input_bad_words}},
{"start_id",
triton::Tensor{triton::MEMORY_CPU, triton::TYPE_INT32, {(size_t)request_batch_size}, start_ids_ptr}},
{"end_id",
triton::Tensor{triton::MEMORY_CPU, triton::TYPE_INT32, {(size_t)request_batch_size}, end_ids_ptr}},
{"CORRID",
triton::Tensor{triton::MEMORY_CPU, triton::TYPE_INT64, {(size_t)request_batch_size}, req_ids_ptr}}
}));
int* beam_width_ptr = (int*)malloc(sizeof(int));
*beam_width_ptr = param.beam_width;
h_pointer_record->push_back(beam_width_ptr);
request_list[device_id]->insert(
{"beam_width",
triton::Tensor{triton::MEMORY_CPU, triton::TYPE_INT32, std::vector<size_t>{1}, beam_width_ptr}});
if (param.beam_width > 1) {
float* beam_search_diversity_rate_ptr = (float*)malloc(sizeof(float));
*beam_search_diversity_rate_ptr = param.beam_search_diversity_rate;
h_pointer_record->push_back(beam_search_diversity_rate_ptr);
request_list[device_id]->insert(
{"beam_search_diversity_rate",
triton::Tensor{
triton::MEMORY_CPU, triton::TYPE_FP32, std::vector<size_t>{1}, beam_search_diversity_rate_ptr}});
}
else {
if (param.runtime_top_p != 0.0f) {
float* runtime_top_p_ptr = (float*)malloc(sizeof(float));
*runtime_top_p_ptr = param.runtime_top_p;
h_pointer_record->push_back(runtime_top_p_ptr);
request_list[device_id]->insert(
{"runtime_top_p",
triton::Tensor{triton::MEMORY_CPU, triton::TYPE_FP32, std::vector<size_t>{1}, runtime_top_p_ptr}});
}
if (param.runtime_top_k != 0) {
uint* runtime_top_k_ptr = (uint*)malloc(sizeof(uint));
*runtime_top_k_ptr = param.runtime_top_k;
h_pointer_record->push_back(runtime_top_k_ptr);
request_list[device_id]->insert(
{"runtime_top_k",
triton::Tensor{
triton::MEMORY_CPU, triton::TYPE_UINT32, std::vector<size_t>{1}, runtime_top_k_ptr}});
}
}
float* temperature_ptr = (float*)malloc(sizeof(float));
*temperature_ptr = param.temperature;
h_pointer_record->push_back(temperature_ptr);
request_list[device_id]->insert(
{"temperature",
triton::Tensor{triton::MEMORY_CPU, triton::TYPE_FP32, std::vector<size_t>{1}, temperature_ptr}});
float* len_penalty_ptr = (float*)malloc(sizeof(float));
*len_penalty_ptr = param.len_penalty;
h_pointer_record->push_back(len_penalty_ptr);
request_list[device_id]->insert(
{"len_penalty",
triton::Tensor{triton::MEMORY_CPU, triton::TYPE_FP32, std::vector<size_t>{1}, len_penalty_ptr}});
if (param.repetition_penalty != 1.0f) {
float* repetition_penalty_ptr = (float*)malloc(sizeof(float));
*repetition_penalty_ptr = param.repetition_penalty;
h_pointer_record->push_back(repetition_penalty_ptr);
request_list[device_id]->insert(
{"repetition_penalty",
triton::Tensor{
triton::MEMORY_CPU, triton::TYPE_FP32, std::vector<size_t>{1}, repetition_penalty_ptr}});
}
if (param.presence_penalty != 0.0f) {
float* presence_penalty_ptr = (float*)malloc(sizeof(float));
*presence_penalty_ptr = param.presence_penalty;
h_pointer_record->push_back(presence_penalty_ptr);
request_list[device_id]->insert(
{"presence_penalty",
triton::Tensor{triton::MEMORY_CPU, triton::TYPE_FP32, std::vector<size_t>{1}, presence_penalty_ptr}});
}
int* min_length_ptr = (int*)malloc(sizeof(int));
*min_length_ptr = param.min_length;
h_pointer_record->push_back(min_length_ptr);
request_list[device_id]->insert(
{"min_length",
triton::Tensor{triton::MEMORY_CPU, triton::TYPE_INT32, std::vector<size_t>{1}, min_length_ptr}});
unsigned long long int* random_seed_ptr = (unsigned long long int*)malloc(sizeof(unsigned long long int));
*random_seed_ptr = param.random_seed;
h_pointer_record->push_back(random_seed_ptr);
request_list[device_id]->insert(
{"random_seed",
triton::Tensor{triton::MEMORY_CPU, triton::TYPE_UINT64, std::vector<size_t>{1}, random_seed_ptr}});
d_pointer_record->push_back(d_input_ids);
// pointer_record->push_back(d_input_lengths);
d_pointer_record->push_back(d_input_bad_words);
h_pointer_record->push_back(request_output_len_ptr);
h_pointer_record->push_back(input_lengths_ptr);
}
return request_list;
}
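// Pads the prompt to max_input_len with end_id (the real length travels in v_start_lengths) and delegates to broadCastRequest().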
std::vector<std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>>
prepare_request(const RequestParam& param,
std::vector<int>& v_start_ids,
std::vector<void*>* h_pointer_record,
std::vector<void*>* d_pointer_record) {
std::vector<int> v_start_lengths;
int real_size = std::min<int>(v_start_ids.size(), param.max_input_len);
if (v_start_ids.size() < param.max_input_len) {
v_start_ids.resize(param.max_input_len, param.end_id);
}
v_start_lengths.push_back(real_size);
std::vector<int> v_bad_words;
auto request_list =
broadCastRequest(v_start_ids, v_start_lengths, v_bad_words, param, h_pointer_record, d_pointer_record);
return request_list;
}
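// Streaming callback registered before forward(); intentionally a no-op in this test.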
void stream_callback(std::shared_ptr<std::unordered_map<std::string, triton::Tensor>> output, void* ctx){
}
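// Runs one forward pass on the given GPU: sets the device, registers the (no-op) streaming callback, and blocks until the pass completes.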
int threadForward(std::unique_ptr<AbstractTransformerModelInstance>* model_instance,
std::shared_ptr<std::unordered_map<std::string, triton::Tensor>> request,
std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>* output_tensors,
const int device_id,
ft::AbstractInstanceComm* comm)
{
ft::check_cuda_error(cudaSetDevice(device_id));
cudaDeviceSynchronize();
(*model_instance)->registerCallback(stream_callback, nullptr);
*output_tensors = (*model_instance)->forward(request, comm);
cudaDeviceSynchronize();
return 0;
}
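// Copies the output_ids and sequence_length tensors from device memory back into host vectors.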
void extract_output(std::shared_ptr<std::unordered_map<std::string, triton::Tensor>> output,
std::vector<int> &hBuf,
std::vector<int> &seq_lens)
{
const int* d_output_ids = (const int*)output.get()->at("output_ids").data;
const int* d_seq_lens = (const int*)output.get()->at("sequence_length").data;
const int batch_size = output.get()->at("output_ids").shape[0];
const int beam_width = output.get()->at("output_ids").shape[1];
const int seq_len = output.get()->at("output_ids").shape[2];
seq_lens.clear();
seq_lens.resize(batch_size);
size_t outCount = batch_size * beam_width * seq_len;
hBuf.clear();
hBuf.resize(outCount);
ft::cudaD2Hcpy(hBuf.data(), d_output_ids, outCount);
ft::cudaD2Hcpy(seq_lens.data(), d_seq_lens, batch_size);
}
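// Releases every recorded allocation for one request: free() for host pointers, ft::deviceFree() for device pointers.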
void free_pointer(std::vector<void*>& h_pointer_record, std::vector<void*>& d_pointer_record) {
for(auto& p : h_pointer_record) {
if(p) {
free(p);
}
}
for(auto& p : d_pointer_record) {
if(p) {
int* pi = (int*)p;
ft::deviceFree(pi);
}
}
}
std::vector<std::string> g_themes = {"北京", "春天", "苹果", "飞机", "键盘", "咖啡", "饺子", "柳树", "短袖", "纸巾"};
std::mutex ios_mutex;
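// Worker loop: each repeat creates per-GPU model instances, tokenizes one prompt, launches one forward thread per GPU, then decodes the tokens generated after the prompt and reports timing.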
void test_worker(RequestParam param, int thread_id, int repeat) {
for(int i = 0; i < repeat; i++){
// step 2.1 create instance comm
std::unique_ptr<ft::AbstractInstanceComm> instance_comm = g_model->createInstanceComm(g_gpu_count);
// step 3: Create model instances
std::vector<std::unique_ptr<AbstractTransformerModelInstance>> model_instances;
create_model_instances(model_instances);
//tokenize
int theme_index = i % g_themes.size();
std::string prompt = "Below is an instruction that describes a task. Write a response that appropriately completes the request.\\n请你扮演一个人工智能助手,回答用户的问题。\\n\\n\\nHuman:请写一篇500字的作文,题目为" + g_themes.at(theme_index) + "\\n\\nAssistant:";
std::vector<int> input_tokens = g_tokenizer->Encode(prompt);
int real_size = input_tokens.size();
// step 4: prepare request
param.request_id = thread_id * repeat + i;
std::vector<void*> h_pointer_record;
std::vector<void*> d_pointer_record;
std::vector<std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>> request_list =
prepare_request(param, input_tokens, &h_pointer_record, &d_pointer_record);
//printf("[INFO] request is created \n");
auto start = std::chrono::high_resolution_clock::now();
// step 5: Forward
std::vector<std::thread> threads;
std::vector<std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>> output_tensors_lists((size_t)g_gpu_count);
threads.clear();
for (int device_id = 0; device_id < g_gpu_count; device_id++) {
threads.push_back(std::thread(threadForward,
&model_instances[device_id],
request_list[device_id],
&output_tensors_lists[device_id],
device_id,
instance_comm.get()));
}
for (auto& t : threads) {
t.join();
}
auto end = std::chrono::high_resolution_clock::now();
auto dur = std::chrono::duration<float, std::milli>(end - start);
std::vector<int> seq_lens;
std::vector<int> hBuf;
extract_output(output_tensors_lists[0], hBuf, seq_lens);
std::vector<int> output_tokens;
int output_size = hBuf.size();
for (int i = real_size; i < output_size; i++) {
if (hBuf[i] == 0) {
break;
}
output_tokens.push_back(hBuf[i]);
}
std::string output_words = g_tokenizer->Decode(output_tokens);
{
std::lock_guard<std::mutex> lock(ios_mutex);
std::cout << "/////////////////////////////////////////////////////////////////////////" << std::endl
<< "request_id:" << param.request_id << std::endl
<< "input tokens:" << real_size << std::endl
<< "output tokens:" << output_tokens.size() << std::endl
<< "forward time:" << dur.count() << std::endl
<< "speed:" << output_tokens.size() * 1000 / dur.count() << std::endl;
}
//free pointer
free_pointer(h_pointer_record, d_pointer_record);
}
}
int main(int argc, char* argv[]) {
std::string ini_name = argc >= 2 ? std::string(argv[1]) : "../examples/cpp/llama/llama_config.ini";
init_tokenizer(ini_name);
printf("-------------------------------init tokenizer success\n");
init_model(ini_name);
printf("-------------------------------init model success\n");
RequestParam param;
get_request_param(ini_name, param);
printf("-------------------------------get request param success\n");
int thread_num = 1;
int repeat_num = 1;
get_loop_config(ini_name, thread_num, repeat_num);
printf("thread_num=%d repeat_num=%d\n", thread_num, repeat_num);
std::vector<std::thread> threads;
threads.clear();
for (int i = 0; i < thread_num; i++) {
threads.push_back(std::thread(test_worker, param, i, repeat_num));
}
for (std::thread& thread : threads) {
thread.join();
}
if (kUSE_MPI) {
ft::mpi::barrier();
}
cudaDeviceSynchronize();
g_single_model_instances.clear();
if (kUSE_MPI) {
ft::mpi::finalize();
}
return 0;
}
Config file:
Checklist
Describe the bug
I wrapped a C interface around the LlamaTritionBackend and TransformerTritonBackend modules.
The execution flow is essentially the same as examples/cpp/llama/llama_triton_example.cc.
I then benchmarked this interface with a test program. Test environment: Ubuntu 20.04.6, single node without MPI, one A800 GPU, driver version 515.105.01, CUDA 11.7.
With the Llama2-7B model, 32 concurrent requests run stably. With the Llama2-13B model, errors occur intermittently once concurrency gets high (they show up with high probability at roughly 8 concurrent requests; my testing was not exhaustive, so I'm not certain of the threshold). Two errors appear most often:
gdb traceback:
1.
2.
Both errors are raised while internal threads are doing CUDA work. Is there something wrong with how I've set things up?
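For reference, a minimal synchronous error check (a sketch only; CUDA_CHECK_SYNC is a hypothetical helper, not part of turbomind, and only the standard CUDA runtime API is assumed) that, combined with running under CUDA_LAUNCH_BLOCKING=1, helps attribute this kind of asynchronous error to the right call:
// Hypothetical debugging helper; assumes only <cuda_runtime.h>, <cstdio>, <cstdlib>.
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

#define CUDA_CHECK_SYNC(tag)                                                         \
    do {                                                                             \
        cudaError_t err_ = cudaDeviceSynchronize();                                  \
        if (err_ != cudaSuccess) {                                                   \
            fprintf(stderr, "[%s] CUDA error: %s\n", tag, cudaGetErrorString(err_)); \
            abort();                                                                 \
        }                                                                            \
    } while (0)

// Example usage inside threadForward():
//   *output_tensors = (*model_instance)->forward(request, comm);
//   CUDA_CHECK_SYNC("after forward");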
Reproduction
Rough code:
void on_thread(){
}
int main() {
createLlamaModel()
}
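Condensed, the concurrency pattern exercised by the full test program above is roughly the following (a sketch only; init_tokenizer/init_model/get_request_param/test_worker and RequestParam refer to the code pasted earlier in this thread):
#include <string>
#include <thread>
#include <vector>

int main(int argc, char* argv[]) {
    std::string ini = argc >= 2 ? argv[1] : "llama_config.ini";
    init_tokenizer(ini);              // load the sentencepiece tokenizer
    init_model(ini);                  // createLlamaModel + NCCL + one instance per GPU
    RequestParam param;
    get_request_param(ini, param);
    const int thread_num = 8;         // roughly where the errors start appearing
    const int repeat_num = 1;
    std::vector<std::thread> workers;
    for (int t = 0; t < thread_num; t++) {
        workers.emplace_back(test_worker, param, t, repeat_num);  // each worker runs forward per GPU
    }
    for (auto& w : workers) {
        w.join();                     // the CUDA errors are raised inside these worker threads
    }
    return 0;
}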
Error traceback
No response