From e3a753c658b7489ca8f8a979ac7a857123f6d5ae Mon Sep 17 00:00:00 2001
From: Shrinath Suresh
Date: Wed, 13 Sep 2023 09:52:32 +0530
Subject: [PATCH] GGUF Compatibility

Signed-off-by: Shrinath Suresh
---
 cpp/build.sh                       |  4 +-
 cpp/src/examples/CMakeLists.txt    | 14 +--
 .../llamacpp_handler.cc}           | 94 +++++++------------
 .../llamacpp_handler.hh}           | 10 +-
 .../torch_scripted_backend_test.cc | 13 +--
 5 files changed, 58 insertions(+), 77 deletions(-)
 rename cpp/src/examples/{llm/llm_handler.cc => llamacpp/llamacpp_handler.cc} (79%)
 rename cpp/src/examples/{llm/llm_handler.hh => llamacpp/llamacpp_handler.hh} (86%)

diff --git a/cpp/build.sh b/cpp/build.sh
index bd08f7c4a4..2a962d7a9e 100755
--- a/cpp/build.sh
+++ b/cpp/build.sh
@@ -299,8 +299,8 @@ function build() {
     mv $DEPS_DIR/../src/examples/libmnist_handler.so $DEPS_DIR/../../test/resources/torchscript_model/mnist/mnist_handler/libmnist_handler.so
   fi
 
-  if [ -f "$DEPS_DIR/../src/examples/libllm_handler.so" ]; then
-    mv $DEPS_DIR/../src/examples/libllm_handler.so $DEPS_DIR/../../test/resources/torchscript_model/llm/llm_handler/libllm_handler.so
+  if [ -f "$DEPS_DIR/../src/examples/libllamacpp_handler.so" ]; then
+    mv $DEPS_DIR/../src/examples/libllamacpp_handler.so $DEPS_DIR/../../test/resources/torchscript_model/llamacpp/llamacpp_handler/libllamacpp_handler.so
   fi
 
   cd $DEPS_DIR/../..
diff --git a/cpp/src/examples/CMakeLists.txt b/cpp/src/examples/CMakeLists.txt
index 66d48ee066..6f8441d190 100644
--- a/cpp/src/examples/CMakeLists.txt
+++ b/cpp/src/examples/CMakeLists.txt
@@ -6,14 +6,14 @@ add_library(mnist_handler SHARED ${MNIST_SOURCE_FILES})
 target_include_directories(mnist_handler PUBLIC ${MNIST_SRC_DIR})
 target_link_libraries(mnist_handler PRIVATE ts_backends_torch_scripted ts_utils ${TORCH_LIBRARIES})
 
-set(LLM_SRC_DIR "${torchserve_cpp_SOURCE_DIR}/src/examples/llm")
+set(LLM_SRC_DIR "${torchserve_cpp_SOURCE_DIR}/src/examples/llamacpp")
 set(LLAMACPP_SRC_DIR "/home/ubuntu/llama.cpp")
 set(LLM_SOURCE_FILES "")
-list(APPEND LLM_SOURCE_FILES ${LLM_SRC_DIR}/llm_handler.cc)
-add_library(llm_handler SHARED ${LLM_SOURCE_FILES})
-target_include_directories(llm_handler PUBLIC ${LLM_SRC_DIR})
-target_include_directories(llm_handler PUBLIC ${LLAMACPP_SRC_DIR})
-target_link_libraries(llm_handler PRIVATE ts_backends_torch_scripted ts_utils ${TORCH_LIBRARIES})
+list(APPEND LLM_SOURCE_FILES ${LLM_SRC_DIR}/llamacpp_handler.cc)
+add_library(llamacpp_handler SHARED ${LLM_SOURCE_FILES})
+target_include_directories(llamacpp_handler PUBLIC ${LLM_SRC_DIR})
+target_include_directories(llamacpp_handler PUBLIC ${LLAMACPP_SRC_DIR})
+target_link_libraries(llamacpp_handler PRIVATE ts_backends_torch_scripted ts_utils ${TORCH_LIBRARIES})
 
 set(MY_OBJECT_FILES
@@ -27,4 +27,4 @@ set(MY_OBJECT_FILES
 )
 
-target_sources(llm_handler PRIVATE ${MY_OBJECT_FILES})
+target_sources(llamacpp_handler PRIVATE ${MY_OBJECT_FILES})
diff --git a/cpp/src/examples/llm/llm_handler.cc b/cpp/src/examples/llamacpp/llamacpp_handler.cc
similarity index 79%
rename from cpp/src/examples/llm/llm_handler.cc
rename to cpp/src/examples/llamacpp/llamacpp_handler.cc
index 09270341cc..fe1a2f7bf0 100644
--- a/cpp/src/examples/llm/llm_handler.cc
+++ b/cpp/src/examples/llamacpp/llamacpp_handler.cc
@@ -1,31 +1,14 @@
-#include "src/examples/image_classifier/llm/llm_handler.hh"
+#include "src/examples/llamacpp/llamacpp_handler.hh"
 
 #include
 #include
 #include
 
-#include "examples/common.h"
-#include "ggml.h"
-#include "llama.h"
-
 namespace llm {
 
-void LlmHandler::initialize_context() {
-  // gpt_params params;
-  params.seed = 42;
-  params.n_threads = 4;
-  params.repeat_last_n = 64;
-
-  auto lparams = llama_context_default_params();
-  lparams.n_ctx = params.n_ctx;
-  lparams.n_gqa = params.n_gqa;
-  lparams.seed = params.seed;
-  lparams.f16_kv = params.memory_f16;
-  lparams.use_mmap = params.use_mmap;
-  lparams.use_mlock = params.use_mlock;
-
-  llama_ctx = llama_new_context_with_model(llamamodel, lparams);
+void LlamacppHandler::initialize_context() {
+  llama_ctx = llama_new_context_with_model(llamamodel, ctx_params);
 
   if (llama_ctx == nullptr) {
     std::cerr << "Failed to initialize llama context" << std::endl;
@@ -36,7 +19,7 @@ void LlmHandler::initialize_context() {
 
 std::pair,
           std::shared_ptr>
-LlmHandler::LoadModel(
+LlamacppHandler::LoadModel(
    std::shared_ptr& load_model_request) {
  try {
    auto device = GetTorchDevice(load_model_request);
@@ -46,24 +29,13 @@ LlmHandler::LoadModel(
                          manifest_->GetModel().serialized_file),
                      *device));
 
-  params.model = "/home/ubuntu/serve/cpp/llama-2-7b-chat.ggmlv3.q4_0.bin";
-  auto lparams = llama_context_default_params();
-  lparams.n_ctx = params.n_ctx;
-  lparams.n_gqa = params.n_gqa;
-  lparams.seed = params.seed;
-  lparams.f16_kv = params.memory_f16;
-  lparams.use_mmap = params.use_mmap;
-  lparams.use_mlock = params.use_mlock;
-  llamamodel = llama_load_model_from_file(params.model.c_str(), lparams);
-  // llama_ctx = llama_new_context_with_model(llamamodel, lparams);
-  // initialize_context();
-
-  // // Load LLM
-  // gpt_params params;
-  // // TODO: Fetch the path from context
-  // params.model = "/home/ubuntu/serve/cpp/llama-2-7b-chat.ggmlv3.q4_0.bin";
-  // llama_backend_init(params.numa);
-  // std::tie(llamamodel, llama_ctx) = llama_init_from_gpt_params(params);
+  params.model = "/home/ubuntu/gpu/llama.cpp/llama-2-7b-chat.Q4_0.gguf";
+  params.main_gpu = 0;
+  params.n_gpu_layers = 35;
+
+  llama_backend_init(params.numa);
+  ctx_params = llama_context_default_params();
+  llamamodel = llama_load_model_from_file(params.model.c_str(), ctx_params);
 
    return std::make_pair(module, device);
  } catch (const c10::Error& e) {
@@ -79,7 +51,7 @@ LlmHandler::LoadModel(
   }
 }
 
-std::vector LlmHandler::Preprocess(
+std::vector LlamacppHandler::Preprocess(
    std::shared_ptr& device,
    std::pair&>& idx_to_req_id,
    std::shared_ptr& request_batch,
@@ -133,7 +105,6 @@ std::vector LlmHandler::Preprocess(
     tokens_list = ::llama_tokenize(llama_ctx, msg, true);
 
     // const int max_context_size = llama_n_ctx(ctx);
-    const int max_context_size = 64;
     const int max_tokens_list_size = max_context_size - 4;
 
     if ((int)tokens_list.size() > max_tokens_list_size) {
@@ -173,7 +144,7 @@ std::vector LlmHandler::Preprocess(
   return batch_ivalue;
 }
 
-torch::Tensor LlmHandler::Inference(
+torch::Tensor LlamacppHandler::Inference(
    std::shared_ptr model, std::vector& inputs,
    std::shared_ptr& device,
@@ -197,19 +168,22 @@ torch::Tensor LlmHandler::Inference(
   for (auto id : long_vector) {
     tokens_list.push_back(id);
   }
+  const int n_gen = std::min(32, max_context_size);
 
-  // gpt_params params;
-
-  const int max_context_size = 64;
+  while (llama_get_kv_cache_token_count(llama_ctx) < n_gen) {
+    // evaluate the transformer
 
-  while (llama_get_kv_cache_token_count(llama_ctx) < max_context_size) {
     if (llama_eval(llama_ctx, tokens_list.data(), int(tokens_list.size()),
                    llama_get_kv_cache_token_count(llama_ctx),
                    params.n_threads)) {
-      std::cout << "Evaluation Failed" << __func__ << std::endl;
-      // TODO: Raise exception here
+      std::cout << "Failed to eval\n" << __func__ << std::endl;
+      break;
     }
+    tokens_list.clear();
+
+    // sample the next token
+
     llama_token new_token_id = 0;
 
     auto logits = llama_get_logits(llama_ctx);
@@ -228,13 +202,17 @@ torch::Tensor LlmHandler::Inference(
 
     new_token_id = llama_sample_token_greedy(llama_ctx, &candidates_p);
 
-    if (new_token_id == llama_token_eos()) {
+    // is it an end of stream ?
+    if (new_token_id == llama_token_eos(llama_ctx)) {
+      std::cout << "Reached [end of text]\n";
       break;
     }
 
-    std::cout << "New Token: " << llama_token_to_str(llama_ctx, new_token_id);
+    // print the new token :
+    std::cout << "New Token: " << llama_token_to_piece(llama_ctx, new_token_id)
+              << std::endl;
 
-    // Push this new token for next evaluation :
+    // push this new token for next evaluation
     tokens_list.push_back(new_token_id);
   }
 
@@ -245,12 +223,12 @@ torch::Tensor LlmHandler::Inference(
   }
 
   torch::Tensor stacked_tensor = torch::stack(tensor_vector);
-
+  llama_print_timings(llama_ctx);
   llama_free(llama_ctx);
   return stacked_tensor;
 }
 
-void LlmHandler::Postprocess(
+void LlamacppHandler::Postprocess(
    const torch::Tensor& data,
    std::pair&>& idx_to_req_id,
    std::shared_ptr& response_batch) {
@@ -263,7 +241,7 @@ void LlmHandler::Postprocess(
   auto data_ptr = data.data_ptr();
   for (int64_t i = 0; i < num_elements; ++i) {
-    generated_text_stream << llama_token_to_str(llama_ctx, data_ptr[i]);
+    generated_text_stream << llama_token_to_piece(llama_ctx, data_ptr[i]);
   }
 
   std::string generated_text_str = generated_text_stream.str();
@@ -297,13 +275,13 @@ void LlmHandler::Postprocess(
 
 #if defined(__linux__) || defined(__APPLE__)
 extern "C" {
-torchserve::torchscripted::BaseHandler* allocatorLlmHandler() {
-  return new llm::LlmHandler();
+torchserve::torchscripted::BaseHandler* allocatorLlamacppHandler() {
+  return new llm::LlamacppHandler();
 }
 
-void deleterLlmHandler(torchserve::torchscripted::BaseHandler* p) {
+void deleterLlamacppHandler(torchserve::torchscripted::BaseHandler* p) {
   if (p != nullptr) {
-    delete static_cast(p);
+    delete static_cast(p);
   }
 }
 }
diff --git a/cpp/src/examples/llm/llm_handler.hh b/cpp/src/examples/llamacpp/llamacpp_handler.hh
similarity index 86%
rename from cpp/src/examples/llm/llm_handler.hh
rename to cpp/src/examples/llamacpp/llamacpp_handler.hh
index 288c7ef89b..43e77826ac 100644
--- a/cpp/src/examples/llm/llm_handler.hh
+++ b/cpp/src/examples/llamacpp/llamacpp_handler.hh
@@ -1,23 +1,25 @@
 #ifndef LLM_HANDLER_HH_
 #define LLM_HANDLER_HH_
 
-#include "examples/common.h"
+#include "common/common.h"
 #include "ggml.h"
 #include "llama.h"
 #include "src/backends/torch_scripted/handler/base_handler.hh"
 
 namespace llm {
-class LlmHandler : public torchserve::torchscripted::BaseHandler {
+class LlamacppHandler : public torchserve::torchscripted::BaseHandler {
  private:
  gpt_params params;
  llama_model* llamamodel;
+  llama_context_params ctx_params;
  llama_context* llama_ctx;
+  const int max_context_size = 32;
 
  public:
  // NOLINTBEGIN(bugprone-exception-escape)
-  LlmHandler() = default;
+  LlamacppHandler() = default;
  // NOLINTEND(bugprone-exception-escape)
-  ~LlmHandler() override = default;
+  ~LlamacppHandler() override = default;
 
  void initialize_context();
diff --git a/cpp/test/backends/torch_scripted/torch_scripted_backend_test.cc b/cpp/test/backends/torch_scripted/torch_scripted_backend_test.cc
index 131893da2d..b18a74fb84 100644
--- a/cpp/test/backends/torch_scripted/torch_scripted_backend_test.cc
+++ b/cpp/test/backends/torch_scripted/torch_scripted_backend_test.cc
@@ -79,12 +79,13 @@ TEST_F(TorchScriptedBackendTest, TestLoadPredictMnistHandler) {
 }
 
 TEST_F(TorchScriptedBackendTest, TestLoadPredictLlmHandler) {
-  this->LoadPredict(std::make_shared(
-                        "test/resources/torchscript_model/llm/llm_handler",
-                        "llm", -1, "", "", 1, false),
-                    "test/resources/torchscript_model/llm/llm_handler",
-                    "test/resources/torchscript_model/llm/llm_handler/prompt.txt",
-                    "llm_ts", 200);
+  this->LoadPredict(
+      std::make_shared(
+          "test/resources/torchscript_model/llamacpp/llamacpp_handler", "llm",
+          -1, "", "", 1, false),
+      "test/resources/torchscript_model/llamacpp/llamacpp_handler",
+      "test/resources/torchscript_model/llamacpp/sentences.json", "llm_ts",
+      200);
 }
 
 TEST_F(TorchScriptedBackendTest, TestBackendInitWrongModelDir) {
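
Note for reviewers: the load/eval/sample path the new handler follows can be exercised outside TorchServe with a short standalone program. The sketch below is illustrative and not part of the patch; the model path, prompt, and generation budget are placeholders, and it assumes the same llama.cpp revision this patch builds against (the llama_eval / llama_get_kv_cache_token_count era of the API), which later llama.cpp releases have reworked.

#include <iostream>
#include <string>
#include <vector>

#include "common/common.h"
#include "llama.h"

int main() {
  gpt_params params;
  params.model = "llama-2-7b-chat.Q4_0.gguf";  // placeholder GGUF path
  params.n_threads = 4;

  // Same initialization order as LlamacppHandler::LoadModel/initialize_context.
  llama_backend_init(params.numa);
  llama_context_params ctx_params = llama_context_default_params();
  llama_model* model =
      llama_load_model_from_file(params.model.c_str(), ctx_params);
  if (model == nullptr) {
    std::cerr << "Failed to load " << params.model << std::endl;
    return 1;
  }
  llama_context* ctx = llama_new_context_with_model(model, ctx_params);
  if (ctx == nullptr) {
    std::cerr << "Failed to create llama context" << std::endl;
    return 1;
  }

  // Tokenize the prompt with a BOS token, mirroring Preprocess().
  std::vector<llama_token> tokens_list =
      ::llama_tokenize(ctx, "Hello, my name is", true);

  const int n_gen = 32;  // placeholder budget; counts prompt + generated tokens
  while (llama_get_kv_cache_token_count(ctx) < n_gen) {
    // Feed the pending tokens, then clear them, as Inference() does.
    if (llama_eval(ctx, tokens_list.data(), int(tokens_list.size()),
                   llama_get_kv_cache_token_count(ctx), params.n_threads)) {
      std::cerr << "Failed to eval" << std::endl;
      break;
    }
    tokens_list.clear();

    // Greedy sampling over the logits of the last evaluated token.
    float* logits = llama_get_logits(ctx);
    int n_vocab = llama_n_vocab(ctx);
    std::vector<llama_token_data> candidates;
    candidates.reserve(n_vocab);
    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
      candidates.push_back(llama_token_data{token_id, logits[token_id], 0.0f});
    }
    llama_token_data_array candidates_p = {candidates.data(), candidates.size(),
                                           false};
    llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);

    if (new_token_id == llama_token_eos(ctx)) break;  // end of stream

    std::cout << llama_token_to_piece(ctx, new_token_id) << std::flush;
    tokens_list.push_back(new_token_id);
  }

  llama_print_timings(ctx);
  llama_free(ctx);
  llama_free_model(model);
  llama_backend_free();
  return 0;
}

Building it only needs the llama.cpp headers and objects that LLAMACPP_SRC_DIR already points at in the CMakeLists.txt change above.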