From 7b8a84113b6959cf3b6df3e828f919eb74591f92 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Tue, 26 Nov 2024 17:10:28 +0800 Subject: [PATCH] refactor --- CMakeLists.txt | 2 +- lmdeploy/turbomind/turbomind.py | 8 +- src/turbomind/models/llama/LlamaBatch.h | 4 +- src/turbomind/models/llama/LlamaV2.h | 6 +- src/turbomind/python/bind.cpp | 214 ++++++------- .../triton_backend/llama/LlamaTritonModel.cc | 191 ++++++------ .../triton_backend/llama/LlamaTritonModel.h | 70 ++--- .../llama/LlamaTritonModelInstance.cc | 206 +++++-------- .../llama/LlamaTritonModelInstance.h | 36 +-- .../transformer_triton_backend.cpp | 52 ++-- .../transformer_triton_backend.hpp | 283 ++---------------- src/turbomind/utils/Tensor.h | 10 + src/turbomind/utils/instance_comm.h | 16 - 13 files changed, 386 insertions(+), 712 deletions(-) delete mode 100644 src/turbomind/utils/instance_comm.h diff --git a/CMakeLists.txt b/CMakeLists.txt index ff2ac7dded..356da56f58 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -304,7 +304,7 @@ link_directories( # add_subdirectory(3rdparty) add_subdirectory(src) -add_subdirectory(examples) +# add_subdirectory(examples) if(BUILD_TEST) add_subdirectory(tests/csrc) diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py index 05bc3e400e..a1b2fff944 100644 --- a/lmdeploy/turbomind/turbomind.py +++ b/lmdeploy/turbomind/turbomind.py @@ -358,12 +358,10 @@ def _forward_callback(self, result, ctx): self.que.put((False, result)) def _forward_thread(self, inputs): - instance_comm = self.tm_model.model_comm.create_instance_comm( - self.gpu_count) def _func(): try: - output = self.model_inst.forward(inputs, instance_comm) + output = self.model_inst.forward(inputs) except Exception as e: logger.error(f'unhandled exception: {e}') self.que.put((-1, None)) @@ -377,12 +375,10 @@ def _async_forward_callback(self, result, ctx, que: LifoQueue): que.put((False, result)) def _async_forward_thread(self, inputs, que: LifoQueue): - instance_comm = self.tm_model.model_comm.create_instance_comm( - self.gpu_count) def _func(): try: - output = self.model_inst.forward(inputs, instance_comm) + output = self.model_inst.forward(inputs) except Exception as e: logger.error(f'unhandled exception: {e}') que.put((-1, None)) diff --git a/src/turbomind/models/llama/LlamaBatch.h b/src/turbomind/models/llama/LlamaBatch.h index 9c66948999..f952da6bae 100644 --- a/src/turbomind/models/llama/LlamaBatch.h +++ b/src/turbomind/models/llama/LlamaBatch.h @@ -12,7 +12,6 @@ #include "src/turbomind/utils/allocator.h" #include "src/turbomind/utils/cublasMMWrapper.h" #include "src/turbomind/utils/cuda_utils.h" -#include "src/turbomind/utils/instance_comm.h" #include #include #include @@ -32,8 +31,7 @@ struct SharedState { }; struct Control { - AbstractInstanceComm* comm; - Request::Callback callback; + Request::Callback callback; }; struct BatchState { diff --git a/src/turbomind/models/llama/LlamaV2.h b/src/turbomind/models/llama/LlamaV2.h index 6321d09d7c..a32183d41b 100644 --- a/src/turbomind/models/llama/LlamaV2.h +++ b/src/turbomind/models/llama/LlamaV2.h @@ -21,6 +21,9 @@ #pragma once +#include +#include + #include "src/turbomind/layers/DynamicDecodeLayer.h" #include "src/turbomind/models/llama/Barrier.h" #include "src/turbomind/models/llama/LlamaBatch.h" @@ -31,10 +34,7 @@ #include "src/turbomind/models/llama/unified_decoder.h" #include "src/turbomind/utils/allocator.h" #include "src/turbomind/utils/cublasMMWrapper.h" -#include "src/turbomind/utils/instance_comm.h" #include "src/turbomind/utils/nccl_utils.h" 
-#include -#include namespace turbomind { diff --git a/src/turbomind/python/bind.cpp b/src/turbomind/python/bind.cpp index 4eb34249ff..da20b94837 100644 --- a/src/turbomind/python/bind.cpp +++ b/src/turbomind/python/bind.cpp @@ -1,34 +1,38 @@ // Copyright (c) OpenMMLab. All rights reserved. -#include "src/turbomind/python/dlpack.h" -#include "src/turbomind/triton_backend/llama/LlamaTritonModel.h" -#include "src/turbomind/triton_backend/transformer_triton_backend.hpp" -#include "src/turbomind/utils/cuda_utils.h" -#include "src/turbomind/utils/nccl_utils.h" -#include #include +#include + +#include + #include #include #include #include #include -#include + +#include "src/turbomind/python/dlpack.h" +#include "src/turbomind/triton_backend/llama/LlamaTritonModel.h" +#include "src/turbomind/triton_backend/transformer_triton_backend.hpp" +#include "src/turbomind/utils/Tensor.h" +#include "src/turbomind/utils/cuda_utils.h" +#include "src/turbomind/utils/nccl_utils.h" namespace py = pybind11; namespace ft = turbomind; using namespace pybind11::literals; // prepare to bind container -using TensorVector = std::vector; +using TensorVector = std::vector; PYBIND11_MAKE_OPAQUE(TensorVector); -using TensorMap = std::unordered_map; +using TensorMap = std::unordered_map; PYBIND11_MAKE_OPAQUE(TensorMap); static const char kDlTensorCapsuleName[] = "dltensor"; -DLDevice getDLDevice(triton::Tensor& tensor) +DLDevice getDLDevice(ft::Tensor& tensor) { int device_id = 0; - if (tensor.where == triton::MEMORY_GPU) { + if (tensor.where == ft::MEMORY_GPU) { cudaPointerAttributes ptr_attr; cudaPointerGetAttributes(&ptr_attr, tensor.data); device_id = ptr_attr.device; @@ -37,13 +41,13 @@ DLDevice getDLDevice(triton::Tensor& tensor) DLDevice device{kDLCPU, device_id}; switch (tensor.where) { - case triton::MEMORY_CPU: + case ft::MEMORY_CPU: device.device_type = DLDeviceType::kDLCPU; break; - case triton::MEMORY_CPU_PINNED: + case ft::MEMORY_CPU_PINNED: device.device_type = DLDeviceType::kDLCUDAHost; break; - case triton::MEMORY_GPU: + case ft::MEMORY_GPU: device.device_type = DLDeviceType::kDLCUDA; break; default: @@ -53,62 +57,62 @@ DLDevice getDLDevice(triton::Tensor& tensor) return device; } -DLManagedTensor* TritonTensorToDLManagedTensor(triton::Tensor& tensor) +DLManagedTensor* TritonTensorToDLManagedTensor(ft::Tensor& tensor) { DLDevice device = getDLDevice(tensor); DLDataType data_type{0, 0, 1}; switch (tensor.type) { - case triton::TYPE_BOOL: + case ft::TYPE_BOOL: data_type.code = DLDataTypeCode::kDLBool; data_type.bits = 8; break; - case triton::TYPE_UINT8: + case ft::TYPE_UINT8: data_type.code = DLDataTypeCode::kDLUInt; data_type.bits = 8; break; - case triton::TYPE_UINT16: + case ft::TYPE_UINT16: data_type.code = DLDataTypeCode::kDLUInt; data_type.bits = 16; break; - case triton::TYPE_UINT32: + case ft::TYPE_UINT32: data_type.code = DLDataTypeCode::kDLUInt; data_type.bits = 32; break; - case triton::TYPE_UINT64: + case ft::TYPE_UINT64: data_type.code = DLDataTypeCode::kDLUInt; data_type.bits = 64; break; - case triton::TYPE_INT8: - case triton::TYPE_BYTES: + case ft::TYPE_INT8: + case ft::TYPE_BYTES: data_type.code = DLDataTypeCode::kDLInt; data_type.bits = 8; break; - case triton::TYPE_INT16: + case ft::TYPE_INT16: data_type.code = DLDataTypeCode::kDLInt; data_type.bits = 16; break; - case triton::TYPE_INT32: + case ft::TYPE_INT32: data_type.code = DLDataTypeCode::kDLInt; data_type.bits = 32; break; - case triton::TYPE_INT64: + case ft::TYPE_INT64: data_type.code = DLDataTypeCode::kDLInt; data_type.bits = 64; 
break; - case triton::TYPE_FP16: + case ft::TYPE_FP16: data_type.code = DLDataTypeCode::kDLFloat; data_type.bits = 16; break; - case triton::TYPE_FP32: + case ft::TYPE_FP32: data_type.code = DLDataTypeCode::kDLFloat; data_type.bits = 32; break; - case triton::TYPE_FP64: + case ft::TYPE_FP64: data_type.code = DLDataTypeCode::kDLFloat; data_type.bits = 64; break; - case triton::TYPE_BF16: + case ft::TYPE_BF16: data_type.code = DLDataTypeCode::kDLBfloat; data_type.bits = 16; break; @@ -125,78 +129,78 @@ DLManagedTensor* TritonTensorToDLManagedTensor(triton::Tensor& tensor) return new DLManagedTensor{dl_tensor, nullptr, [](DLManagedTensor* dlmt) { delete dlmt; }}; } -triton::MemoryType getMemoryType(DLDevice device) +ft::MemoryType getMemoryType(DLDevice device) { switch (device.device_type) { case DLDeviceType::kDLCUDAHost: - return triton::MemoryType::MEMORY_CPU_PINNED; + return ft::MemoryType::MEMORY_CPU_PINNED; case DLDeviceType::kDLCUDA: - return triton::MemoryType::MEMORY_GPU; + return ft::MemoryType::MEMORY_GPU; case DLDeviceType::kDLCPU: default: - return triton::MemoryType::MEMORY_CPU; + return ft::MemoryType::MEMORY_CPU; } } -triton::DataType getDataType(DLDataType data_type) +ft::DataType getDataType(DLDataType data_type) { switch (data_type.code) { case DLDataTypeCode::kDLUInt: switch (data_type.bits) { case 8: - return triton::TYPE_UINT8; + return ft::TYPE_UINT8; case 16: - return triton::TYPE_UINT16; + return ft::TYPE_UINT16; case 32: - return triton::TYPE_UINT32; + return ft::TYPE_UINT32; case 64: - return triton::TYPE_UINT64; + return ft::TYPE_UINT64; default: - return triton::TYPE_INVALID; + return ft::TYPE_INVALID; } break; case DLDataTypeCode::kDLInt: switch (data_type.bits) { case 8: - return triton::TYPE_INT8; + return ft::TYPE_INT8; case 16: - return triton::TYPE_INT16; + return ft::TYPE_INT16; case 32: - return triton::TYPE_INT32; + return ft::TYPE_INT32; case 64: - return triton::TYPE_INT64; + return ft::TYPE_INT64; default: - return triton::TYPE_INVALID; + return ft::TYPE_INVALID; } break; case DLDataTypeCode::kDLFloat: switch (data_type.bits) { case 16: - return triton::TYPE_FP16; + return ft::TYPE_FP16; case 32: - return triton::TYPE_FP32; + return ft::TYPE_FP32; case 64: - return triton::TYPE_FP64; + return ft::TYPE_FP64; default: - return triton::TYPE_INVALID; + return ft::TYPE_INVALID; } break; case DLDataTypeCode::kDLBfloat: switch (data_type.bits) { case 16: - return triton::TYPE_BF16; + return ft::TYPE_BF16; default: - return triton::TYPE_INVALID; + return ft::TYPE_INVALID; } break; case DLDataTypeCode::kDLBool: - return triton::TYPE_BOOL; + return ft::TYPE_BOOL; default: - return triton::TYPE_INVALID; + return ft::TYPE_INVALID; } } -std::shared_ptr DLManagedTensorToTritonTensor(DLManagedTensor* tensor) +std::shared_ptr DLManagedTensorToTritonTensor(DLManagedTensor* tensor) { auto& dl_tensor = tensor->dl_tensor; auto where = getMemoryType(dl_tensor.device); @@ -205,7 +209,7 @@ std::shared_ptr DLManagedTensorToTritonTensor(DLManagedTensor* t std::vector shape(dl_tensor.shape, dl_tensor.shape + dl_tensor.ndim); auto data = dl_tensor.data; - return std::make_shared(where, dtype, shape, data); + return std::make_shared(where, dtype, shape, data); } DLTensor GetDLTensor(py::object obj) @@ -225,70 +229,65 @@ PYBIND11_MODULE(_turbomind, m) // custom comm py::class_>(m, "AbstractCustomComm"); - // instance comm - py::class_(m, "AbstractInstanceComm"); - // data type - py::enum_(m, "DataType") - .value("TYPE_INVALID", triton::DataType::TYPE_INVALID) - 
.value("TYPE_BOOL", triton::DataType::TYPE_BOOL) - .value("TYPE_UINT8", triton::DataType::TYPE_UINT8) - .value("TYPE_UINT16", triton::DataType::TYPE_UINT16) - .value("TYPE_UINT32", triton::DataType::TYPE_UINT32) - .value("TYPE_UINT64", triton::DataType::TYPE_UINT64) - .value("TYPE_INT8", triton::DataType::TYPE_INT8) - .value("TYPE_INT16", triton::DataType::TYPE_INT16) - .value("TYPE_INT32", triton::DataType::TYPE_INT32) - .value("TYPE_INT64", triton::DataType::TYPE_INT64) - .value("TYPE_FP16", triton::DataType::TYPE_FP16) - .value("TYPE_FP32", triton::DataType::TYPE_FP32) - .value("TYPE_FP64", triton::DataType::TYPE_FP64) - .value("TYPE_BYTES", triton::DataType::TYPE_BYTES) - .value("TYPE_BF16", triton::DataType::TYPE_BF16); + py::enum_(m, "DataType") + .value("TYPE_INVALID", ft::DataType::TYPE_INVALID) + .value("TYPE_BOOL", ft::DataType::TYPE_BOOL) + .value("TYPE_UINT8", ft::DataType::TYPE_UINT8) + .value("TYPE_UINT16", ft::DataType::TYPE_UINT16) + .value("TYPE_UINT32", ft::DataType::TYPE_UINT32) + .value("TYPE_UINT64", ft::DataType::TYPE_UINT64) + .value("TYPE_INT8", ft::DataType::TYPE_INT8) + .value("TYPE_INT16", ft::DataType::TYPE_INT16) + .value("TYPE_INT32", ft::DataType::TYPE_INT32) + .value("TYPE_INT64", ft::DataType::TYPE_INT64) + .value("TYPE_FP16", ft::DataType::TYPE_FP16) + .value("TYPE_FP32", ft::DataType::TYPE_FP32) + .value("TYPE_FP64", ft::DataType::TYPE_FP64) + .value("TYPE_BYTES", ft::DataType::TYPE_BYTES) + .value("TYPE_BF16", ft::DataType::TYPE_BF16); // memory type - py::enum_(m, "MemoryType") - .value("MEMORY_CPU", triton::MemoryType::MEMORY_CPU) - .value("MEMORY_CPU_PINNED", triton::MemoryType::MEMORY_CPU_PINNED) - .value("MEMORY_GPU", triton::MemoryType::MEMORY_GPU); + py::enum_(m, "MemoryType") + .value("MEMORY_CPU", ft::MemoryType::MEMORY_CPU) + .value("MEMORY_CPU_PINNED", ft::MemoryType::MEMORY_CPU_PINNED) + .value("MEMORY_GPU", ft::MemoryType::MEMORY_GPU); // tensor - py::class_>(m, "Tensor") - .def_readonly("where", &triton::Tensor::where) - .def_readonly("type", &triton::Tensor::type) - .def_readonly("shape", &triton::Tensor::shape) - .def_readonly("data", &triton::Tensor::data) - .def(py::init([](const triton::MemoryType where, - const triton::DataType type, - const std::vector& shape, - const long data) { - auto data_ptr = reinterpret_cast(data); - return new triton::Tensor(where, type, shape, data_ptr); - })) + py::class_>(m, "Tensor") + .def_readonly("where", &ft::Tensor::where) + .def_readonly("type", &ft::Tensor::type) + .def_readonly("shape", &ft::Tensor::shape) + .def_readonly("data", &ft::Tensor::data) + .def(py::init( + [](const ft::MemoryType where, const ft::DataType type, const std::vector& shape, const long data) { + auto data_ptr = reinterpret_cast(data); + return new ft::Tensor(where, type, shape, data_ptr); + })) .def( "view", - [](triton::Tensor* self, triton::DataType new_type) { - return new triton::Tensor(self->where, new_type, self->shape, self->data); + [](ft::Tensor* self, ft::DataType new_type) { + return new ft::Tensor(self->where, new_type, self->shape, self->data); }, "new_type"_a) .def( "view", - [](triton::Tensor* self, std::vector new_shape) { - return new triton::Tensor(self->where, self->type, new_shape, self->data); + [](ft::Tensor* self, std::vector new_shape) { + return new ft::Tensor(self->where, self->type, new_shape, self->data); }, "new_shape"_a) .def( "copy_from", - [](triton::Tensor* self, py::object obj) { + [](ft::Tensor* self, py::object obj) { py::capsule cap = obj.attr("__dlpack__")(); DLManagedTensor* dlmt = 
static_cast(PyCapsule_GetPointer(cap.ptr(), kDlTensorCapsuleName)); auto src = DLManagedTensorToTritonTensor(dlmt); switch (self->type) { - case triton::TYPE_FP16: - case triton::TYPE_FP32: - case triton::TYPE_INT32: - case triton::TYPE_BF16: { + case ft::TYPE_FP16: + case ft::TYPE_FP32: + case ft::TYPE_INT32: + case ft::TYPE_BF16: { auto num_element = std::accumulate(src->shape.begin(), src->shape.end(), 1LL, std::multiplies()); auto num_bytes = num_element * dlmt->dl_tensor.dtype.bits / 8; @@ -304,7 +303,7 @@ PYBIND11_MODULE(_turbomind, m) "tensor"_a) .def( "__dlpack__", - [](triton::Tensor* self, long stream) { + [](ft::Tensor* self, long stream) { DLManagedTensor* dlmt = TritonTensorToDLManagedTensor(*self); return py::capsule(dlmt, kDlTensorCapsuleName, [](PyObject* obj) { DLManagedTensor* dlmt = @@ -320,7 +319,7 @@ PYBIND11_MODULE(_turbomind, m) }); }, "stream"_a = 0) - .def("__dlpack_device__", [](triton::Tensor* self) { + .def("__dlpack_device__", [](ft::Tensor* self) { auto device = getDLDevice(*self); return std::tuple(int(device.device_type), device.device_id); }); @@ -336,19 +335,19 @@ PYBIND11_MODULE(_turbomind, m) "dl_managed_tensor"_a); // transformer model instance + using ft::AbstractTransformerModelInstance; py::bind_map>(m, "TensorMap"); py::class_(m, "AbstractTransformerModelInstance") .def( "forward", - [](AbstractTransformerModelInstance* model, - std::shared_ptr input_tensors, - ft::AbstractInstanceComm* inst_comm) { return model->forward(input_tensors, inst_comm); }, + [](AbstractTransformerModelInstance* model, std::shared_ptr input_tensors) { + return model->forward(input_tensors); + }, py::call_guard(), - "input_tensors"_a, - "inst_comm"_a = nullptr) + "input_tensors"_a) .def( "register_callback", - [](AbstractTransformerModelInstance* self, triton_stream_cb_t cb, py::object ctx) { + [](AbstractTransformerModelInstance* self, ft::triton_stream_cb_t cb, py::object ctx) { self->registerCallback(cb, ctx.ptr()); }, "callback"_a, @@ -356,6 +355,8 @@ PYBIND11_MODULE(_turbomind, m) .def("unregister_callback", &AbstractTransformerModelInstance::unRegisterCallback); // transformer model + using ft::AbstractTransformerModel; + using ft::LlamaTritonModel; py::class_>(m, "AbstractTransformerModel") .def_static( "create_llama_model", @@ -419,7 +420,6 @@ PYBIND11_MODULE(_turbomind, m) return ret; }, "world_size"_a) - .def("create_instance_comm", &AbstractTransformerModel::createInstanceComm, "size"_a) .def( "create_model_instance", [](AbstractTransformerModel* model, diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc index 2deca46380..aab9287762 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc @@ -27,17 +27,18 @@ #include "src/turbomind/models/llama/LlamaDenseWeight.h" #include "src/turbomind/models/llama/context.h" #include "src/turbomind/models/llama/llama_params.h" +#include "src/turbomind/utils/allocator.h" +#include "src/turbomind/utils/cuda_utils.h" + #include "src/turbomind/triton_backend/llama/LlamaTritonModel.h" #include "src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h" #include "src/turbomind/triton_backend/transformer_triton_backend.hpp" -#include "src/turbomind/utils/allocator.h" -#include "src/turbomind/utils/cuda_utils.h" -namespace ft = turbomind; +namespace turbomind { -static std::optional get_moe_method() +static std::optional get_moe_method() { - static const auto value = []() -> std::optional 
{ + static const auto value = []() -> std::optional { const auto p = std::getenv("TM_MOE_METHOD"); if (p) { std::string str(p); @@ -45,10 +46,10 @@ static std::optional get_moe_method() x = std::tolower(x); } if (str == "naive") { - return ft::MoeParam::kNaive; + return MoeParam::kNaive; } else if (str == "fused") { - return ft::MoeParam::kFused; + return MoeParam::kFused; } else { std::cerr << "[WARNING] unrecognised MoE method: " << str << "\n"; @@ -67,7 +68,7 @@ std::shared_ptr AbstractTransformerModel::createLlamaM } catch (const YAML::Exception& e) { std::cerr << "Error reading YAML config: " << e.what() << std::endl; - ft::FT_CHECK(false); + FT_CHECK(false); } const auto ft_instance_hyperparameter = reader["ft_instance_hyperparameter"]; @@ -91,7 +92,7 @@ std::shared_ptr AbstractTransformerModel::createLlamaM model_dir); #else TM_LOG_ERROR("[ERROR] Turbomind is not built with ENABLE_BF16"); - ft::FT_CHECK(false); + FT_CHECK(false); #endif } else { @@ -103,7 +104,7 @@ std::shared_ptr AbstractTransformerModel::createLlamaM model_dir); #else TM_LOG_ERROR("[ERROR] Turbomind is not built with ENABLE_BF32"); - ft::FT_CHECK(false); + FT_CHECK(false); #endif } return nullptr; @@ -205,10 +206,10 @@ void LlamaTritonModel::handleMissingParams() template LlamaTritonModel::~LlamaTritonModel() { - ft::FT_CHECK(weights_.size() == engines_.size()); + FT_CHECK(weights_.size() == engines_.size()); for (int device_id = 0; device_id < (int)engines_.size(); ++device_id) { // Set device id before destructing CUDA resources - ft::check_cuda_error(cudaSetDevice(device_id)); + check_cuda_error(cudaSetDevice(device_id)); engines_[device_id].reset(); weights_[device_id].reset(); } @@ -222,7 +223,7 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, std::string config): tensor_para_size_(tensor_para_size), pipeline_para_size_(pipeline_para_size), - weights_(ft::getDeviceCount()), + weights_(getDeviceCount()), enable_custom_all_reduce_(enable_custom_all_reduce) { FT_CHECK_WITH_INFO(!(config.empty() && model_dir.empty()), "invalid init options"); @@ -242,7 +243,7 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, } catch (const YAML::Exception& e) { std::cerr << "Error reading YAML config: " << e.what() << std::endl; - ft::FT_CHECK(false); + FT_CHECK(false); } const auto model_reader = reader["model_config"]; @@ -297,7 +298,7 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, engine_param_.num_tokens_per_iter = engine_reader["num_tokens_per_iter"].as(0); engine_param_.max_prefill_iters = engine_reader["max_prefill_iters"].as(1); - lora_param_.policy = ft::getLoraPolicy(reader["lora_config"]["lora_policy"].as("")); + lora_param_.policy = getLoraPolicy(reader["lora_config"]["lora_policy"].as("")); lora_param_.r = lora_reader["lora_r"].as(0); lora_param_.scale = lora_reader["lora_scale"].as(0); lora_param_.max_wo_r = lora_reader["lora_max_wo_r"].as(0); @@ -313,75 +314,75 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, handleMissingParams(); - shared_state_ = std::make_shared(); - shared_state_->barrier = std::make_shared(tensor_para_size); + shared_state_ = std::make_shared(); + shared_state_->barrier = std::make_shared(tensor_para_size); - const auto device_count = ft::getDeviceCount(); + const auto device_count = getDeviceCount(); engines_.resize(device_count); const std::string weight_type_str = model_reader["weight_type"].as(); if (weight_type_str == "fp16" || weight_type_str == "float16") { - weight_type_ = ft::WeightType::kFP16; + weight_type_ = 
WeightType::kFP16; } else if (weight_type_str == "bf16" || weight_type_str == "bfloat16") { - weight_type_ = ft::WeightType::kBF16; + weight_type_ = WeightType::kBF16; } else if (weight_type_str == "fp32") { - weight_type_ = ft::WeightType::kFP32; + weight_type_ = WeightType::kFP32; } else if (weight_type_str == "int8") { - weight_type_ = ft::WeightType::kINT8; + weight_type_ = WeightType::kINT8; } else if (weight_type_str == "int4") { - weight_type_ = ft::WeightType::kINT4; + weight_type_ = WeightType::kINT4; } else { std::cout << "[ERROR] Unsupported weight type: '" << weight_type_str << "'\n"; - ft::FT_CHECK(0); + FT_CHECK(0); } if (auto method = get_moe_method()) { moe_param_.method = *method; } else { - moe_param_.method = ft::MoeParam::kFused; + moe_param_.method = MoeParam::kFused; } TM_LOG_INFO("%s", toString().c_str()); } template -std::unique_ptr> LlamaTritonModel::createSharedModelInstance( - int device_id, - int rank, - std::pair, std::vector> nccl_params, - std::shared_ptr custom_all_reduce_comm) +std::unique_ptr> +LlamaTritonModel::createSharedModelInstance(int device_id, + int rank, + std::pair, std::vector> nccl_params, + std::shared_ptr custom_all_reduce_comm) { - ft::check_cuda_error(cudaSetDevice(device_id)); + check_cuda_error(cudaSetDevice(device_id)); const int comms_rank = device_id % (tensor_para_size_ * pipeline_para_size_); - auto ctx = std::make_unique>(device_id); + auto ctx = std::make_unique>(device_id); - ft::NcclParam tensor_para = nccl_params.first[comms_rank]; - ft::NcclParam pipeline_para = nccl_params.second[comms_rank]; + NcclParam tensor_para = nccl_params.first[comms_rank]; + NcclParam pipeline_para = nccl_params.second[comms_rank]; - ft::FT_CHECK(tensor_para.world_size_ == tensor_para_size_); - ft::FT_CHECK(pipeline_para.world_size_ == pipeline_para_size_); + FT_CHECK(tensor_para.world_size_ == tensor_para_size_); + FT_CHECK(pipeline_para.world_size_ == pipeline_para_size_); - auto model = std::make_unique>(model_param_, // - attn_param_, - moe_param_, - lora_param_, - tensor_para, - *ctx, - engine_param_.max_batch_size, - weights_[device_id]); + auto model = std::make_unique>(model_param_, // + attn_param_, + moe_param_, + lora_param_, + tensor_para, + *ctx, + engine_param_.max_batch_size, + weights_[device_id]); - auto engine = std::make_unique>(engine_param_, // - std::move(model), - std::move(ctx), - shared_state_, - device_id); + auto engine = std::make_unique>(engine_param_, // + std::move(model), + std::move(ctx), + shared_state_, + device_id); // Wait for pinned buffers to be allocated for all ranks, otherwise tuning will hang // due to concurrent kernel launch & cudaMallocHost @@ -397,14 +398,14 @@ std::unique_ptr LlamaTritonModel::createModelInstance(int device_id, int rank, cudaStream_t stream, - std::pair, std::vector>, - std::shared_ptr) + std::pair, std::vector>, + std::shared_ptr) { - ft::check_cuda_error(cudaSetDevice(device_id)); + check_cuda_error(cudaSetDevice(device_id)); - ft::FT_CHECK(engines_[device_id] != nullptr); + FT_CHECK(engines_[device_id] != nullptr); - auto allocator = std::make_unique>(device_id, false); + auto allocator = std::make_unique>(device_id, false); allocator->setStream(stream); @@ -414,25 +415,25 @@ LlamaTritonModel::createModelInstance(int device_id, template void LlamaTritonModel::createSharedWeights(int device_id, int rank) { - ft::check_cuda_error(cudaSetDevice(device_id)); + check_cuda_error(cudaSetDevice(device_id)); const int tensor_para_rank = rank % tensor_para_size_; const int pipeline_para_rank = 
rank / tensor_para_size_; - ft::FT_CHECK(pipeline_para_size_ == 1 && pipeline_para_rank == 0); - weights_[device_id] = std::make_shared>(model_param_.head_num, - model_param_.kv_head_num, - model_param_.head_dim, - model_param_.hidden_units, - model_param_.inter_size, - model_param_.vocab_size, - model_param_.embedding_size, - model_param_.layer_num, - attn_bias_, - weight_type_, - group_size_, - lora_param_, - moe_param_, - tensor_para_size_, - tensor_para_rank); + FT_CHECK(pipeline_para_size_ == 1 && pipeline_para_rank == 0); + weights_[device_id] = std::make_shared>(model_param_.head_num, + model_param_.kv_head_num, + model_param_.head_dim, + model_param_.hidden_units, + model_param_.inter_size, + model_param_.vocab_size, + model_param_.embedding_size, + model_param_.layer_num, + attn_bias_, + weight_type_, + group_size_, + lora_param_, + moe_param_, + tensor_para_size_, + tensor_para_rank); // model inited with model_dir if (model_dir_ != "") { weights_[device_id]->loadModel(model_dir_); @@ -441,37 +442,41 @@ void LlamaTritonModel::createSharedWeights(int device_id, int rank) } template -TensorMap LlamaTritonModel::getParams(int deviceId, int rank) +std::unordered_map LlamaTritonModel::getParams(int deviceId, int rank) { - ft::check_cuda_error(cudaSetDevice(deviceId)); + check_cuda_error(cudaSetDevice(deviceId)); + // shared_weight should be created before getParams - ft::FT_CHECK(weights_[deviceId] != nullptr); - ft::TensorMap output = weights_[deviceId]->getParams(); - TensorMap result; + FT_CHECK(weights_[deviceId] != nullptr); + + TensorMap output = weights_[deviceId]->getParams(); + + std::unordered_map result; for (auto [name, tensor] : output) { - result.emplace(name, triton::Tensor{tensor.where, tensor.type, tensor.shape, tensor.data}); + result.insert({{name, Tensor{tensor.where, tensor.type, tensor.shape, tensor.data}}}); } + return result; } template void LlamaTritonModel::processWeights(int device_id, int rank) { - ft::check_cuda_error(cudaSetDevice(device_id)); - ft::FT_CHECK(weights_[device_id] != nullptr); + check_cuda_error(cudaSetDevice(device_id)); + FT_CHECK(weights_[device_id] != nullptr); cudaDeviceProp props{}; - ft::check_cuda_error(cudaGetDeviceProperties(&props, device_id)); + check_cuda_error(cudaGetDeviceProperties(&props, device_id)); weights_[device_id]->prepare(props); - ft::sync_check_cuda_error(); + sync_check_cuda_error(); } template -void LlamaTritonModel::createEngine(int device_id, - int rank, - std::pair, std::vector> nccl_params, - std::shared_ptr custom_all_reduce_comm) +void LlamaTritonModel::createEngine(int device_id, + int rank, + std::pair, std::vector> nccl_params, + std::shared_ptr custom_all_reduce_comm) { auto engine = createSharedModelInstance(device_id, rank, nccl_params, custom_all_reduce_comm); @@ -509,17 +514,11 @@ std::string LlamaTritonModel::toString() } template -void LlamaTritonModel::createCustomComms( - std::vector>* custom_all_reduce_comms, int world_size) +void LlamaTritonModel::createCustomComms(std::vector>* custom_all_reduce_comms, + int world_size) { - using commDataType = typename ft::CustomARCommTypeConverter::Type; - ft::initCustomAllReduceComm(custom_all_reduce_comms, enable_custom_all_reduce_, world_size); -} - -template -std::unique_ptr LlamaTritonModel::createInstanceComm(int size) -{ - return nullptr; + using commDataType = typename CustomARCommTypeConverter::Type; + initCustomAllReduceComm(custom_all_reduce_comms, enable_custom_all_reduce_, world_size); } template @@ -541,3 +540,5 @@ template struct 
LlamaTritonModel; #ifdef ENABLE_BF16 template struct LlamaTritonModel<__nv_bfloat16>; #endif + +} // namespace turbomind diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.h b/src/turbomind/triton_backend/llama/LlamaTritonModel.h index 19a143e721..b1d00a7cb6 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.h +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.h @@ -31,7 +31,7 @@ #include #include -namespace ft = turbomind; +namespace turbomind { template struct LlamaTritonModel: public AbstractTransformerModel { @@ -44,27 +44,25 @@ struct LlamaTritonModel: public AbstractTransformerModel { ~LlamaTritonModel() override; std::unique_ptr - createModelInstance(int deviceId, - int rank, - cudaStream_t stream, - std::pair, std::vector> nccl_params, - std::shared_ptr custom_all_reduce_comm = nullptr) override; + createModelInstance(int deviceId, + int rank, + cudaStream_t stream, + std::pair, std::vector> nccl_params, + std::shared_ptr custom_all_reduce_comm = nullptr) override; void createSharedWeights(int deviceId, int rank) override; - TensorMap getParams(int deviceId, int rank) override; + std::unordered_map getParams(int deviceId, int rank) override; void processWeights(int deviceId, int rank) override; - void createEngine(int device_id, - int rank, - std::pair, std::vector> nccl_params, - std::shared_ptr) override; + void createEngine(int device_id, + int rank, + std::pair, std::vector> nccl_params, + std::shared_ptr) override; - void createCustomComms(std::vector>* custom_all_reduce_comms, - int world_size) override; - - std::unique_ptr createInstanceComm(int size) override; + void createCustomComms(std::vector>* custom_all_reduce_comms, + int world_size) override; void handleMissingParams(); @@ -78,27 +76,27 @@ struct LlamaTritonModel: public AbstractTransformerModel { int getPipelineParaSize() override; private: - std::unique_ptr> - createSharedModelInstance(int deviceId, - int rank, - std::pair, std::vector> nccl_params, - std::shared_ptr custom_all_reduce_comm = nullptr); - - ft::ModelParam model_param_; - ft::AttentionParam attn_param_; - ft::MoeParam moe_param_; - ft::LoraParam lora_param_; - ft::EngineParam engine_param_; - size_t tensor_para_size_; - size_t pipeline_para_size_; - ft::WeightType weight_type_; - bool attn_bias_; - int group_size_; - - std::shared_ptr shared_state_; + std::unique_ptr> + createSharedModelInstance(int deviceId, + int rank, + std::pair, std::vector> nccl_params, + std::shared_ptr custom_all_reduce_comm = nullptr); + + ModelParam model_param_; + AttentionParam attn_param_; + MoeParam moe_param_; + LoraParam lora_param_; + EngineParam engine_param_; + size_t tensor_para_size_; + size_t pipeline_para_size_; + WeightType weight_type_; + bool attn_bias_; + int group_size_; + + std::shared_ptr shared_state_; // Weights & engine instances for the ranks - std::vector>> weights_; - std::vector>> engines_; + std::vector>> weights_; + std::vector>> engines_; bool is_fp16_; int enable_custom_all_reduce_ = 0; @@ -108,3 +106,5 @@ struct LlamaTritonModel: public AbstractTransformerModel { ffi_api_lock_ctrl_t ffi_lock_ = nullptr; }; + +} // namespace turbomind diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.cc b/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.cc index 8221f932ce..976fc9cc1d 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.cc @@ -31,78 +31,23 @@ #include #include -namespace ft = turbomind; 
+namespace turbomind { template -void triton_stream_callback(std::unordered_map* output_tensors, void* ctx) +void triton_stream_callback(std::unordered_map* outputs, void* ctx) { - LlamaTritonModelInstance* model = reinterpret_cast*>(ctx); - auto result = LlamaTritonModelInstance::convert_outputs(*output_tensors); - - model->stream_cb_(result, model->stream_ctx_); + LlamaTritonModelInstance* model = reinterpret_cast*>(ctx); + model->stream_cb_(std::make_shared>(*outputs), model->stream_ctx_); } template -LlamaTritonModelInstance::LlamaTritonModelInstance(ft::Engine& instance, - std::unique_ptr> allocator, - int device_id): +LlamaTritonModelInstance::LlamaTritonModelInstance(Engine& instance, + std::unique_ptr> allocator, + int device_id): device_id_{device_id}, instance_(&instance), allocator_(std::move(allocator)) { } -template -std::unordered_map LlamaTritonModelInstance::convert_inputs( - std::shared_ptr> input_tensors) -{ - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - - const size_t request_batch_size = input_tensors->at("input_ids").shape[0]; - const size_t input_data_len = input_tensors->at("input_ids").shape[1]; - h_total_output_lengths_ = - (uint32_t*)std::realloc((void*)h_total_output_lengths_, request_batch_size * sizeof(uint32_t)); - - std::unordered_map ft_input_tensors{}; - - for (auto t = input_tensors->begin(); t != input_tensors->end(); ++t) { - if (ft_input_tensors.count(t->first) == 0) { - ft_input_tensors.insert({t->first, t->second.convertTritonTensorToFt()}); - } - } - - return ft_input_tensors; -} - -template -std::shared_ptr> -LlamaTritonModelInstance::convert_outputs(const std::unordered_map& output_tensors) -{ - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - std::unordered_map* outputs_mapping = - new std::unordered_map(); - - for (auto it = output_tensors.begin(); it != output_tensors.end(); it++) { - outputs_mapping->insert({it->first, triton::Tensor::convertFtTensorToTriton(it->second)}); - } - - return std::shared_ptr>(outputs_mapping); -} - -template -std::shared_ptr> -LlamaTritonModelInstance::forward(std::shared_ptr> input_tensors) -{ - ft::FT_CHECK(false); - return nullptr; -} - -template -std::shared_ptr> -LlamaTritonModelInstance::forward(std::shared_ptr> input_tensors) -{ - ft::FT_CHECK(false); - return nullptr; -} - template std::string format_vector(const std::vector& vec) { @@ -118,120 +63,109 @@ std::string format_vector(const std::vector& vec) } template -std::shared_ptr> -LlamaTritonModelInstance::forward(std::shared_ptr> input_tensors, - ft::AbstractInstanceComm* instance_comm) +std::shared_ptr> +LlamaTritonModelInstance::forward(std::shared_ptr> inputs) { TM_LOG_DEBUG(__PRETTY_FUNCTION__); // In some cases, this is needed to trigger the creation of CUDA context, or later `cudaMallocAsync` will die - ft::check_cuda_error(cudaSetDevice(device_id_)); + check_cuda_error(cudaSetDevice(device_id_)); - FT_CHECK_WITH_INFO(input_tensors->at("input_ids").shape.size() == 2, - "input_tensors->at(\"input_ids\").shape.size() == 2"); - FT_CHECK_WITH_INFO(input_tensors->at("input_lengths").shape.size() == 1, - "input_tensors->at(\"input_lengths\").shape.size() == 1"); + FT_CHECK_WITH_INFO(inputs->at("input_ids").shape.size() == 2, "inputs->at(\"input_ids\").shape.size() == 2"); + FT_CHECK_WITH_INFO(inputs->at("input_lengths").shape.size() == 1, + "inputs->at(\"input_lengths\").shape.size() == 1"); - const uint32_t request_batch_size = input_tensors->at("input_ids").shape[0]; - const uint32_t max_request_output_len = (size_t)*std::max_element( - 
(int*)input_tensors->at("request_output_len").data, - (int*)input_tensors->at("request_output_len").data + input_tensors->at("request_output_len").shape[0]); + const uint32_t request_batch_size = inputs->at("input_ids").shape[0]; + const uint32_t max_request_output_len = (size_t)*std::max_element((int*)inputs->at("request_output_len").data, + (int*)inputs->at("request_output_len").data + + inputs->at("request_output_len").shape[0]); // const uint32_t total_output_len = max_request_output_len + input_tensors->at("input_ids").shape[1]; - const uint32_t beam_width = - input_tensors->count("beam_width") ? (size_t)(*(uint*)input_tensors->at("beam_width").data) : 1; + const uint32_t beam_width = inputs->count("beam_width") ? (size_t)(*(uint*)inputs->at("beam_width").data) : 1; FT_CHECK_WITH_INFO(beam_width == 1, "Beam search is not implemented"); - std::unordered_map ft_input_tensors = convert_inputs(input_tensors); + h_total_output_lengths_ = + (uint32_t*)std::realloc((void*)h_total_output_lengths_, request_batch_size * sizeof(uint32_t)); - const size_t max_input_len = input_tensors->at("input_ids").shape[1]; - const bool is_return_logits = - input_tensors->count("is_return_logits") && *(bool*)input_tensors->at("is_return_logits").data; + const size_t max_input_len = inputs->at("input_ids").shape[1]; + const bool is_return_logits = inputs->count("is_return_logits") && *(bool*)inputs->at("is_return_logits").data; const size_t vocab_size = instance_->model().vocab_size(); allocateBuffer(request_batch_size, max_input_len, beam_width, instance_->session_len(), is_return_logits); - std::unordered_map output_tensors = std::unordered_map{ + std::unordered_map outputs{ {"output_ids", - ft::Tensor{ft::MEMORY_CPU, - ft::TYPE_UINT32, - std::vector{request_batch_size, beam_width, (size_t)instance_->session_len()}, - d_output_ids_}}, + Tensor{MEMORY_CPU, + TYPE_UINT32, + std::vector{request_batch_size, beam_width, (size_t)instance_->session_len()}, + d_output_ids_}}, {"sequence_length", - ft::Tensor{ft::MEMORY_CPU, - ft::TYPE_UINT32, - std::vector{request_batch_size, beam_width}, - d_sequence_lengths_}}}; - - if (input_tensors->count("is_return_log_probs") && *((bool*)input_tensors->at("is_return_log_probs").data)) { - output_tensors.insert({"output_log_probs", - ft::Tensor{ft::MEMORY_GPU, - ft::TYPE_FP32, - std::vector{request_batch_size, beam_width, max_request_output_len}, - d_output_log_probs_}}); - output_tensors.insert({"cum_log_probs", - ft::Tensor{ft::MEMORY_GPU, - ft::TYPE_FP32, - std::vector{request_batch_size, beam_width}, - d_cum_log_probs_}}); + Tensor{MEMORY_CPU, TYPE_UINT32, std::vector{request_batch_size, beam_width}, d_sequence_lengths_}}}; + + if (inputs->count("is_return_log_probs") && *((bool*)inputs->at("is_return_log_probs").data)) { + outputs.insert({"output_log_probs", + Tensor{MEMORY_GPU, + TYPE_FP32, + std::vector{request_batch_size, beam_width, max_request_output_len}, + d_output_log_probs_}}); + outputs.insert( + {"cum_log_probs", + Tensor{MEMORY_GPU, TYPE_FP32, std::vector{request_batch_size, beam_width}, d_cum_log_probs_}}); } - if (input_tensors->count("logprobs")) { + if (inputs->count("logprobs")) { size_t max_logprob_length = std::min((int)max_request_output_len, instance_->session_len()) + 1; h_logprob_vals_ = (float*)std::realloc( - h_logprob_vals_, sizeof(float) * request_batch_size * beam_width * max_logprob_length * ft::kMaxLogProb); - h_logprob_indexes_ = (uint32_t*)std::realloc(h_logprob_indexes_, - sizeof(uint32_t) * request_batch_size * beam_width - * 
max_logprob_length * ft::kMaxLogProb); - h_logprob_nums_ = (uint32_t*)std::realloc( + h_logprob_vals_, sizeof(float) * request_batch_size * beam_width * max_logprob_length * kMaxLogProb); + h_logprob_indexes_ = (uint32_t*)std::realloc( + h_logprob_indexes_, sizeof(uint32_t) * request_batch_size * beam_width * max_logprob_length * kMaxLogProb); + h_logprob_nums_ = (uint32_t*)std::realloc( h_logprob_nums_, sizeof(uint32_t) * request_batch_size * beam_width * max_logprob_length); - output_tensors.insert( - {{"logprob_vals", - ft::Tensor{ft::MEMORY_CPU, - ft::TYPE_FP32, - std::vector{request_batch_size, beam_width, max_logprob_length, ft::kMaxLogProb}, - h_logprob_vals_}}}); - - output_tensors.insert( - {{"logprob_indexes", - ft::Tensor{ft::MEMORY_CPU, - ft::TYPE_UINT32, - std::vector{request_batch_size, beam_width, max_logprob_length, ft::kMaxLogProb}, - h_logprob_indexes_}}}); - - output_tensors.insert({{"logprob_nums", - ft::Tensor{ft::MEMORY_CPU, - ft::TYPE_UINT32, - std::vector{request_batch_size, beam_width, max_logprob_length}, - h_logprob_nums_}}}); + outputs.insert({{"logprob_vals", + Tensor{MEMORY_CPU, + TYPE_FP32, + std::vector{request_batch_size, beam_width, max_logprob_length, kMaxLogProb}, + h_logprob_vals_}}}); + + outputs.insert({{"logprob_indexes", + Tensor{MEMORY_CPU, + TYPE_UINT32, + std::vector{request_batch_size, beam_width, max_logprob_length, kMaxLogProb}, + h_logprob_indexes_}}}); + + outputs.insert({{"logprob_nums", + Tensor{MEMORY_CPU, + TYPE_UINT32, + std::vector{request_batch_size, beam_width, max_logprob_length}, + h_logprob_nums_}}}); } if (is_return_logits) { - output_tensors.insert( - {"logits", - {ft::MEMORY_GPU, ft::TYPE_FP32, {request_batch_size, max_input_len, vocab_size}, d_output_logits_}}); + outputs.insert( + {{"logits", {MEMORY_GPU, TYPE_FP32, {request_batch_size, max_input_len, vocab_size}, d_output_logits_}}}); } try { - ft::Request::Callback callback; + Request::Callback callback; if (stream_cb_) { - callback = [this](std::unordered_map* outputs) { + callback = [this](std::unordered_map* outputs) { triton_stream_callback(outputs, this); }; } - ft::check_cuda_error(cudaStreamSynchronize(allocator_->returnStream())); - instance_->Submit(&output_tensors, &ft_input_tensors, {instance_comm, callback}); + check_cuda_error(cudaStreamSynchronize(allocator_->returnStream())); + + instance_->Submit(&outputs, inputs.get(), {callback}); // ! stream synced by the model before returning } catch (...) 
{ h_exception_ = std::current_exception(); - output_tensors.insert({"error_message", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_BYTES, {1}, &h_exception_}}); + outputs.insert({"error_message", Tensor{MEMORY_CPU, TYPE_BYTES, {1}, &h_exception_}}); } - return convert_outputs(output_tensors); + return std::make_shared>(std::move(outputs)); } template @@ -278,3 +212,5 @@ template struct LlamaTritonModelInstance; #ifdef ENABLE_BF16 template struct LlamaTritonModelInstance<__nv_bfloat16>; #endif + +} // namespace turbomind diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h b/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h index 08088c05d5..2cf69b9fa5 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h +++ b/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h @@ -20,41 +20,29 @@ #pragma once +#include + #include "src/turbomind/models/llama/LlamaBatch.h" #include "src/turbomind/models/llama/LlamaV2.h" #include "src/turbomind/triton_backend/llama/LlamaTritonModel.h" #include "src/turbomind/triton_backend/transformer_triton_backend.hpp" -#include -namespace ft = turbomind; +namespace turbomind { template struct LlamaTritonModelInstance: AbstractTransformerModelInstance { - LlamaTritonModelInstance(ft::Engine& instance, - std::unique_ptr> allocator, - int device_id); - ~LlamaTritonModelInstance(); - - std::shared_ptr> - forward(std::shared_ptr> input_tensors) override; + LlamaTritonModelInstance(Engine& instance, + std::unique_ptr> allocator, + int device_id); + ~LlamaTritonModelInstance() override; - std::shared_ptr> - forward(std::shared_ptr> input_tensors) override; - - std::shared_ptr> - forward(std::shared_ptr> input_tensors, - ft::AbstractInstanceComm*) override; - - static std::shared_ptr> - convert_outputs(const std::unordered_map& output_tensors); + virtual std::shared_ptr> + forward(std::shared_ptr> input_tensors) override; private: - ft::Engine* instance_; - const std::unique_ptr> allocator_; - - std::unordered_map - convert_inputs(std::shared_ptr> input_tensors); + Engine* instance_; + const std::unique_ptr> allocator_; void allocateBuffer(const size_t request_batch_size, const size_t max_input_len, @@ -88,3 +76,5 @@ struct LlamaTritonModelInstance: AbstractTransformerModelInstance { uint32_t* h_total_output_lengths_ = nullptr; std::exception_ptr h_exception_ = nullptr; }; + +} // namespace turbomind diff --git a/src/turbomind/triton_backend/transformer_triton_backend.cpp b/src/turbomind/triton_backend/transformer_triton_backend.cpp index 16c64b17d5..acf5e06e88 100644 --- a/src/turbomind/triton_backend/transformer_triton_backend.cpp +++ b/src/turbomind/triton_backend/transformer_triton_backend.cpp @@ -21,62 +21,66 @@ #include "src/turbomind/triton_backend/transformer_triton_backend.hpp" #include "src/turbomind/utils/nccl_utils.h" -std::pair, std::vector> +namespace turbomind { + +std::pair, std::vector> AbstractTransformerModel::createNcclParams(const int node_id, const int device_id_start, const bool multi_node) { - const int gpu_count = ft::getDeviceCount(); + const int gpu_count = getDeviceCount(); const int tensor_para_size = getTensorParaSize(); const int pipeline_para_size = getPipelineParaSize(); const int local_comm_size = multi_node ? 
gpu_count : tensor_para_size * pipeline_para_size; - ft::FT_CHECK(tensor_para_size > 0 && pipeline_para_size > 0); - ft::FT_CHECK(device_id_start + (int)local_comm_size <= gpu_count); + FT_CHECK(tensor_para_size > 0 && pipeline_para_size > 0); + FT_CHECK(device_id_start + (int)local_comm_size <= gpu_count); - std::vector nccl_ids; + std::vector nccl_ids; if (tensor_para_size > 1 || pipeline_para_size > 1) { nccl_ids.resize(tensor_para_size + pipeline_para_size); if (node_id == 0) { for (uint32_t i = 0; i < nccl_ids.size(); i++) { - ft::ftNcclGetUniqueId(nccl_ids[i]); + ftNcclGetUniqueId(nccl_ids[i]); } } } - std::vector tensor_para_params(local_comm_size); - std::vector pipeline_para_params(local_comm_size); + std::vector tensor_para_params(local_comm_size); + std::vector pipeline_para_params(local_comm_size); // Don't init comm when size == 1 if (tensor_para_size > 1) { - const auto group_id = ft::ftNcclNextGroupId(); - ft::ftNcclGroupStart(); + const auto group_id = ftNcclNextGroupId(); + ftNcclGroupStart(); for (int gid = device_id_start; gid < device_id_start + local_comm_size; gid++) { int rank = node_id * gpu_count + gid - device_id_start; int tensor_para_rank = rank % tensor_para_size; int pipeline_para_rank = rank / tensor_para_size; - ft::NcclUid tensor_para_nccl_uid = nccl_ids[pipeline_para_rank]; - ft::check_cuda_error(cudaSetDevice(gid)); - ft::ftNcclCommInitRank( + NcclUid tensor_para_nccl_uid = nccl_ids[pipeline_para_rank]; + check_cuda_error(cudaSetDevice(gid)); + ftNcclCommInitRank( tensor_para_params[gid - device_id_start], tensor_para_rank, tensor_para_size, tensor_para_nccl_uid); tensor_para_params[gid - device_id_start].group_id_ = group_id; } - ft::ftNcclGroupEnd(); + ftNcclGroupEnd(); } if (pipeline_para_size > 1) { - const auto group_id = ft::ftNcclNextGroupId(); - ft::ftNcclGroupStart(); + const auto group_id = ftNcclNextGroupId(); + ftNcclGroupStart(); for (int gid = device_id_start; gid < device_id_start + local_comm_size; gid++) { int rank = node_id * gpu_count + gid - device_id_start; int tensor_para_rank = rank % tensor_para_size; int pipeline_para_rank = rank / tensor_para_size; - ft::NcclUid pipeline_para_nccl_uid = nccl_ids[pipeline_para_size + tensor_para_rank]; - ft::check_cuda_error(cudaSetDevice(gid)); - ft::ftNcclCommInitRank(pipeline_para_params[gid - device_id_start], - pipeline_para_rank, - pipeline_para_size, - pipeline_para_nccl_uid); + NcclUid pipeline_para_nccl_uid = nccl_ids[pipeline_para_size + tensor_para_rank]; + check_cuda_error(cudaSetDevice(gid)); + ftNcclCommInitRank(pipeline_para_params[gid - device_id_start], + pipeline_para_rank, + pipeline_para_size, + pipeline_para_nccl_uid); pipeline_para_params[gid - device_id_start].group_id_ = group_id; } - ft::ftNcclGroupEnd(); + ftNcclGroupEnd(); } - return std::pair, std::vector>(tensor_para_params, pipeline_para_params); + return std::pair, std::vector>(tensor_para_params, pipeline_para_params); } + +} // namespace turbomind diff --git a/src/turbomind/triton_backend/transformer_triton_backend.hpp b/src/turbomind/triton_backend/transformer_triton_backend.hpp index 066d75a780..6d49df4578 100644 --- a/src/turbomind/triton_backend/transformer_triton_backend.hpp +++ b/src/turbomind/triton_backend/transformer_triton_backend.hpp @@ -30,242 +30,11 @@ #include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/custom_ar_comm.h" -#include "src/turbomind/utils/instance_comm.h" #include "src/turbomind/utils/nccl_utils.h" -namespace ft = turbomind; +namespace turbomind { -namespace triton { 
-#ifdef USE_TRITONSERVER_DATATYPE - -#include "triton/core/tritonbackend.h" -#include "triton/core/tritonserver.h" - -#ifndef TRITONSERVER_API_VERSION_MAJOR -#error TRITONSERVER_API_VERSION_MAJOR Undefined! -#endif - -#ifndef TRITONSERVER_API_VERSION_MINOR -#error TRITONSERVER_API_VERSION_MINOR Undefined! -#endif - -#if (TRITONSERVER_API_VERSION_MAJOR == 1 && TRITONSERVER_API_VERSION_MINOR >= 17) \ - || (TRITONSERVER_API_VERSION_MAJOR > 1) -#define ENABLE_TRITON_BF16 1 -#endif - -typedef TRITONSERVER_DataType DataType; -typedef TRITONSERVER_MemoryType MemoryType; - -constexpr TRITONSERVER_DataType TYPE_INVALID = TRITONSERVER_TYPE_INVALID; -constexpr TRITONSERVER_DataType TYPE_BOOL = TRITONSERVER_TYPE_BOOL; -constexpr TRITONSERVER_DataType TYPE_UINT8 = TRITONSERVER_TYPE_UINT8; -constexpr TRITONSERVER_DataType TYPE_UINT16 = TRITONSERVER_TYPE_UINT16; -constexpr TRITONSERVER_DataType TYPE_UINT32 = TRITONSERVER_TYPE_UINT32; -constexpr TRITONSERVER_DataType TYPE_UINT64 = TRITONSERVER_TYPE_UINT64; -constexpr TRITONSERVER_DataType TYPE_INT8 = TRITONSERVER_TYPE_INT8; -constexpr TRITONSERVER_DataType TYPE_INT16 = TRITONSERVER_TYPE_INT16; -constexpr TRITONSERVER_DataType TYPE_INT32 = TRITONSERVER_TYPE_INT32; -constexpr TRITONSERVER_DataType TYPE_INT64 = TRITONSERVER_TYPE_INT64; -constexpr TRITONSERVER_DataType TYPE_FP16 = TRITONSERVER_TYPE_FP16; -constexpr TRITONSERVER_DataType TYPE_FP32 = TRITONSERVER_TYPE_FP32; -constexpr TRITONSERVER_DataType TYPE_FP64 = TRITONSERVER_TYPE_FP64; -constexpr TRITONSERVER_DataType TYPE_BYTES = TRITONSERVER_TYPE_BYTES; - -#ifdef ENABLE_TRITON_BF16 -constexpr TRITONSERVER_DataType TYPE_BF16 = TRITONSERVER_TYPE_BF16; -#endif -constexpr TRITONSERVER_MemoryType MEMORY_CPU = TRITONSERVER_MEMORY_CPU; -constexpr TRITONSERVER_MemoryType MEMORY_CPU_PINNED = TRITONSERVER_MEMORY_CPU_PINNED; -constexpr TRITONSERVER_MemoryType MEMORY_GPU = TRITONSERVER_MEMORY_GPU; - -#else - -typedef ft::DataType DataType; -typedef ft::MemoryType MemoryType; - -constexpr DataType TYPE_INVALID = ft::TYPE_INVALID; -constexpr DataType TYPE_BOOL = ft::TYPE_BOOL; -constexpr DataType TYPE_UINT8 = ft::TYPE_UINT8; -constexpr DataType TYPE_UINT16 = ft::TYPE_UINT16; -constexpr DataType TYPE_UINT32 = ft::TYPE_UINT32; -constexpr DataType TYPE_UINT64 = ft::TYPE_UINT64; -constexpr DataType TYPE_INT8 = ft::TYPE_INT8; -constexpr DataType TYPE_INT16 = ft::TYPE_INT16; -constexpr DataType TYPE_INT32 = ft::TYPE_INT32; -constexpr DataType TYPE_INT64 = ft::TYPE_INT64; -constexpr DataType TYPE_FP16 = ft::TYPE_FP16; -constexpr DataType TYPE_FP32 = ft::TYPE_FP32; -constexpr DataType TYPE_FP64 = ft::TYPE_FP64; -constexpr DataType TYPE_BYTES = ft::TYPE_BYTES; -constexpr DataType TYPE_BF16 = ft::TYPE_BF16; -constexpr MemoryType MEMORY_CPU = ft::MEMORY_CPU; -constexpr MemoryType MEMORY_CPU_PINNED = ft::MEMORY_CPU_PINNED; -constexpr MemoryType MEMORY_GPU = ft::MEMORY_GPU; - -#endif - -struct Tensor { - const MemoryType where; - const DataType type; - const std::vector shape; - const void* data; - - Tensor(const MemoryType _where, const DataType _type, const std::vector _shape, const void* _data): - where(_where), type(_type), shape(_shape), data(_data) - { - } - - static ft::DataType convertTritonTypeToFt(DataType tmp_type) - { - ft::DataType ft_data_type; - switch (tmp_type) { - case TYPE_INVALID: - ft_data_type = ft::DataType::TYPE_INVALID; - break; - case TYPE_BOOL: - ft_data_type = ft::DataType::TYPE_BOOL; - break; - case TYPE_UINT8: - ft_data_type = ft::DataType::TYPE_UINT8; - break; - case TYPE_UINT16: - ft_data_type = 
ft::DataType::TYPE_UINT16; - break; - case TYPE_UINT32: - ft_data_type = ft::DataType::TYPE_UINT32; - break; - case TYPE_UINT64: - ft_data_type = ft::DataType::TYPE_UINT64; - break; - case TYPE_INT8: - ft_data_type = ft::DataType::TYPE_INT8; - break; - case TYPE_INT16: - ft_data_type = ft::DataType::TYPE_INT16; - break; - case TYPE_INT32: - ft_data_type = ft::DataType::TYPE_INT32; - break; - case TYPE_INT64: - ft_data_type = ft::DataType::TYPE_INT64; - break; - case TYPE_FP16: - ft_data_type = ft::DataType::TYPE_FP16; - break; - case TYPE_FP32: - ft_data_type = ft::DataType::TYPE_FP32; - break; - case TYPE_FP64: - ft_data_type = ft::DataType::TYPE_FP64; - break; -#ifdef ENABLE_TRITON_BF16 - case TYPE_BF16: - ft_data_type = ft::DataType::TYPE_BF16; - break; -#endif - case TYPE_BYTES: - ft_data_type = ft::DataType::TYPE_BYTES; - break; - default: - FT_CHECK_WITH_INFO(false, "Unknown data type with type id: " + std::to_string(tmp_type)); - break; - } - return ft_data_type; - } - - ft::Tensor convertTritonTensorToFt() - { - ft::DataType ft_data_type = convertTritonTypeToFt(type); - ft::MemoryType ft_memory_type; - switch (where) { - case MEMORY_CPU: - ft_memory_type = ft::MemoryType::MEMORY_CPU; - break; - case MEMORY_CPU_PINNED: - ft_memory_type = ft::MemoryType::MEMORY_CPU_PINNED; - break; - case MEMORY_GPU: - ft_memory_type = ft::MemoryType::MEMORY_GPU; - break; - } - return ft::Tensor{ft_memory_type, ft_data_type, shape, data}; - } - - static Tensor convertFtTensorToTriton(ft::Tensor ft_tensor) - { - DataType triton_data_type; - switch (ft_tensor.type) { - case TYPE_INVALID: - triton_data_type = TYPE_INVALID; - break; - case TYPE_BOOL: - triton_data_type = TYPE_BOOL; - break; - case TYPE_UINT8: - triton_data_type = TYPE_UINT8; - break; - case TYPE_UINT16: - triton_data_type = TYPE_UINT16; - break; - case TYPE_UINT32: - triton_data_type = TYPE_UINT32; - break; - case TYPE_UINT64: - triton_data_type = TYPE_UINT64; - break; - case TYPE_INT8: - triton_data_type = TYPE_INT8; - break; - case TYPE_INT16: - triton_data_type = TYPE_INT16; - break; - case TYPE_INT32: - triton_data_type = TYPE_INT32; - break; - case TYPE_INT64: - triton_data_type = TYPE_INT64; - break; - case TYPE_FP16: - triton_data_type = TYPE_FP16; - break; - case TYPE_FP32: - triton_data_type = TYPE_FP32; - break; - case TYPE_FP64: - triton_data_type = TYPE_FP64; - break; -#ifdef ENABLE_TRITON_BF16 - case TYPE_BF16: - triton_data_type = TYPE_BF16; - break; -#endif - case TYPE_BYTES: - triton_data_type = TYPE_BYTES; - break; - default: - FT_CHECK_WITH_INFO(false, "Unknown data type with type id: " + std::to_string(ft_tensor.type)); - break; - } - MemoryType triton_memory_type; - switch (ft_tensor.where) { - case MEMORY_CPU: - triton_memory_type = MEMORY_CPU; - break; - case MEMORY_CPU_PINNED: - triton_memory_type = MEMORY_CPU_PINNED; - break; - case MEMORY_GPU: - triton_memory_type = MEMORY_GPU; - break; - } - return Tensor{triton_memory_type, triton_data_type, ft_tensor.shape, ft_tensor.data}; - } -}; - -} // namespace triton - -using triton_stream_cb_t = std::function>, void*)>; +using triton_stream_cb_t = std::function>, void*)>; struct AbstractTransformerModel; struct AbstractTransformerModelInstance; @@ -273,17 +42,8 @@ struct AbstractTransformerModelInstance; struct AbstractTransformerModelInstance { virtual ~AbstractTransformerModelInstance() = default; - virtual std::shared_ptr> - forward(std::shared_ptr> input_tensors) = 0; - - virtual std::shared_ptr> - forward(std::shared_ptr> input_tensors) = 0; - - virtual 
std::shared_ptr> - forward(std::shared_ptr> input_tensors, ft::AbstractInstanceComm*) - { - return forward(input_tensors); - } + virtual std::shared_ptr> + forward(std::shared_ptr> input_tensors) = 0; void registerCallback(triton_stream_cb_t cb, void* ctx) { @@ -301,43 +61,38 @@ struct AbstractTransformerModelInstance { void* stream_ctx_ = nullptr; }; -using TensorMap = std::unordered_map; - struct AbstractTransformerModel { static std::shared_ptr createLlamaModel(std::string model_dir); virtual ~AbstractTransformerModel() = default; - virtual std::pair, std::vector> + virtual std::pair, std::vector> createNcclParams(const int node_id, const int device_id_start = 0, const bool multi_node = false); - virtual void createCustomComms(std::vector>* custom_all_reduce_comms, - int world_size) = 0; - - virtual std::unique_ptr createInstanceComm(int size) - { - return nullptr; - } + virtual void createCustomComms(std::vector>* custom_all_reduce_comms, + int world_size) = 0; virtual std::unique_ptr - createModelInstance(int deviceId, - int rank, - cudaStream_t stream, - std::pair, std::vector> nccl_params, - std::shared_ptr custom_all_reduce_comm = nullptr) = 0; + createModelInstance(int deviceId, + int rank, + cudaStream_t stream, + std::pair, std::vector> nccl_params, + std::shared_ptr custom_all_reduce_comm = nullptr) = 0; virtual void createSharedWeights(int deviceId, int rank) = 0; - virtual TensorMap getParams(int deviceId, int rank) = 0; + virtual std::unordered_map getParams(int deviceId, int rank) = 0; virtual void processWeights(int deviceId, int rank) = 0; - virtual void createEngine(int device_id, - int rank, - std::pair, std::vector> nccl_params, - std::shared_ptr) = 0; + virtual void createEngine(int device_id, + int rank, + std::pair, std::vector> nccl_params, + std::shared_ptr) = 0; virtual std::string toString() = 0; virtual int getTensorParaSize() = 0; virtual int getPipelineParaSize() = 0; }; + +} // namespace turbomind diff --git a/src/turbomind/utils/Tensor.h b/src/turbomind/utils/Tensor.h index 6214f6bbc2..b2b8524e09 100644 --- a/src/turbomind/utils/Tensor.h +++ b/src/turbomind/utils/Tensor.h @@ -515,6 +515,16 @@ class TensorMap { return tensor_map_.end(); } + int count(const std::string& key) const + { + return tensor_map_.count(key); + } + + bool empty() const + { + return tensor_map_.empty(); + } + std::string toString(); static TensorMap fromNpyFolder(const std::string& base_folder); void saveNpy(const std::string& base_folder); diff --git a/src/turbomind/utils/instance_comm.h b/src/turbomind/utils/instance_comm.h deleted file mode 100644 index 5a25360a05..0000000000 --- a/src/turbomind/utils/instance_comm.h +++ /dev/null @@ -1,16 +0,0 @@ -#pragma once - -namespace turbomind { - -class AbstractInstanceComm { -public: - virtual ~AbstractInstanceComm() = default; - - virtual void barrier() = 0; - - virtual void setSharedObject(void*) = 0; - - virtual void* getSharedObject() = 0; -}; - -} // namespace turbomind