GGUF Compatibility
Signed-off-by: Shrinath Suresh <[email protected]>
shrinath-suresh committed Sep 13, 2023
1 parent f351d1d commit e3a753c
Showing 5 changed files with 58 additions and 77 deletions.
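At a glance, the commit renames the generic LLM example handler to LlamacppHandler and moves it onto the GGUF-era llama.cpp API: llama_token_to_piece replaces llama_token_to_str, llama_token_eos now takes the context, and the model file becomes a .gguf instead of a .ggmlv3 .bin. The snippet below is a hypothetical helper, not code from this commit, sketching the detokenization path the handler switches to; it assumes the common/common.h helper and llama.h signatures from the llama.cpp revision vendored around September 2023.

// Hypothetical helper (not part of this commit) illustrating the GGUF-era
// detokenization path: llama_token_to_piece replaces llama_token_to_str,
// and the end-of-stream token is queried per context.
#include <sstream>
#include <string>
#include <vector>

#include "common/common.h"  // std::string llama_token_to_piece(ctx, token)
#include "llama.h"

static std::string decode_tokens(llama_context* ctx,
                                 const std::vector<llama_token>& ids) {
  std::ostringstream text;
  for (llama_token id : ids) {
    if (id == llama_token_eos(ctx)) {  // eos is now context-aware
      break;
    }
    text << llama_token_to_piece(ctx, id);
  }
  return text.str();
}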
4 changes: 2 additions & 2 deletions cpp/build.sh
@@ -299,8 +299,8 @@ function build() {
mv $DEPS_DIR/../src/examples/libmnist_handler.so $DEPS_DIR/../../test/resources/torchscript_model/mnist/mnist_handler/libmnist_handler.so
fi

if [ -f "$DEPS_DIR/../src/examples/libllm_handler.so" ]; then
mv $DEPS_DIR/../src/examples/libllm_handler.so $DEPS_DIR/../../test/resources/torchscript_model/llm/llm_handler/libllm_handler.so
if [ -f "$DEPS_DIR/../src/examples/libllamacpp_handler.so" ]; then
mv $DEPS_DIR/../src/examples/libllamacpp_handler.so $DEPS_DIR/../../test/resources/torchscript_model/llamacpp/llamacpp_handler/libllamacpp_handler.so
fi

cd $DEPS_DIR/../..
14 changes: 7 additions & 7 deletions cpp/src/examples/CMakeLists.txt
@@ -6,14 +6,14 @@ add_library(mnist_handler SHARED ${MNIST_SOURCE_FILES})
target_include_directories(mnist_handler PUBLIC ${MNIST_SRC_DIR})
target_link_libraries(mnist_handler PRIVATE ts_backends_torch_scripted ts_utils ${TORCH_LIBRARIES})

set(LLM_SRC_DIR "${torchserve_cpp_SOURCE_DIR}/src/examples/llm")
set(LLM_SRC_DIR "${torchserve_cpp_SOURCE_DIR}/src/examples/llamacpp")
set(LLAMACPP_SRC_DIR "/home/ubuntu/llama.cpp")
set(LLM_SOURCE_FILES "")
list(APPEND LLM_SOURCE_FILES ${LLM_SRC_DIR}/llm_handler.cc)
add_library(llm_handler SHARED ${LLM_SOURCE_FILES})
target_include_directories(llm_handler PUBLIC ${LLM_SRC_DIR})
target_include_directories(llm_handler PUBLIC ${LLAMACPP_SRC_DIR})
target_link_libraries(llm_handler PRIVATE ts_backends_torch_scripted ts_utils ${TORCH_LIBRARIES})
list(APPEND LLM_SOURCE_FILES ${LLM_SRC_DIR}/llamacpp_handler.cc)
add_library(llamacpp_handler SHARED ${LLM_SOURCE_FILES})
target_include_directories(llamacpp_handler PUBLIC ${LLM_SRC_DIR})
target_include_directories(llamacpp_handler PUBLIC ${LLAMACPP_SRC_DIR})
target_link_libraries(llamacpp_handler PRIVATE ts_backends_torch_scripted ts_utils ${TORCH_LIBRARIES})


set(MY_OBJECT_FILES
@@ -27,4 +27,4 @@ set(MY_OBJECT_FILES

)

target_sources(llm_handler PRIVATE ${MY_OBJECT_FILES})
target_sources(llamacpp_handler PRIVATE ${MY_OBJECT_FILES})
cpp/src/examples/llamacpp/llamacpp_handler.cc
@@ -1,31 +1,14 @@
nclude "src/examples/image_classifier/llm/llm_handler.hh"
#include "src/examples/llamacpp/llamacpp_handler.hh"

#include <torch/script.h>
#include <torch/torch.h>

#include <typeinfo>

#include "examples/common.h"
#include "ggml.h"
#include "llama.h"

namespace llm {

void LlmHandler::initialize_context() {
// gpt_params params;
params.seed = 42;
params.n_threads = 4;
params.repeat_last_n = 64;

auto lparams = llama_context_default_params();
lparams.n_ctx = params.n_ctx;
lparams.n_gqa = params.n_gqa;
lparams.seed = params.seed;
lparams.f16_kv = params.memory_f16;
lparams.use_mmap = params.use_mmap;
lparams.use_mlock = params.use_mlock;

llama_ctx = llama_new_context_with_model(llamamodel, lparams);
void LlamacppHandler::initialize_context() {
llama_ctx = llama_new_context_with_model(llamamodel, ctx_params);

if (llama_ctx == nullptr) {
std::cerr << "Failed to initialize llama context" << std::endl;
@@ -36,7 +19,7 @@ void LlmHandler::initialize_context() {

std::pair<std::shared_ptr<torch::jit::script::Module>,
std::shared_ptr<torch::Device>>
LlmHandler::LoadModel(
LlamacppHandler::LoadModel(
std::shared_ptr<torchserve::LoadModelRequest>& load_model_request) {
try {
auto device = GetTorchDevice(load_model_request);
@@ -46,24 +29,13 @@ LlmHandler::LoadModel(
manifest_->GetModel().serialized_file),
*device));

params.model = "/home/ubuntu/serve/cpp/llama-2-7b-chat.ggmlv3.q4_0.bin";
auto lparams = llama_context_default_params();
lparams.n_ctx = params.n_ctx;
lparams.n_gqa = params.n_gqa;
lparams.seed = params.seed;
lparams.f16_kv = params.memory_f16;
lparams.use_mmap = params.use_mmap;
lparams.use_mlock = params.use_mlock;
llamamodel = llama_load_model_from_file(params.model.c_str(), lparams);
// llama_ctx = llama_new_context_with_model(llamamodel, lparams);
// initialize_context();

// // Load LLM
// gpt_params params;
// // TODO: Fetch the path from context
// params.model = "/home/ubuntu/serve/cpp/llama-2-7b-chat.ggmlv3.q4_0.bin";
// llama_backend_init(params.numa);
// std::tie(llamamodel, llama_ctx) = llama_init_from_gpt_params(params);
params.model = "/home/ubuntu/gpu/llama.cpp/llama-2-7b-chat.Q4_0.gguf";
params.main_gpu = 0;
params.n_gpu_layers = 35;

llama_backend_init(params.numa);
ctx_params = llama_context_default_params();
llamamodel = llama_load_model_from_file(params.model.c_str(), ctx_params);

return std::make_pair(module, device);
} catch (const c10::Error& e) {
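For reference, a condensed sketch of the GGUF load path that LoadModel and initialize_context now perform between them; the model path and GPU settings below are placeholders rather than values from this commit, and llama_load_model_from_file still accepts llama_context_params in the llama.cpp revision this change builds against.

// Condensed, illustrative sketch of the GGUF load path (placeholder paths
// and GPU settings; mirrors the call sequence in the diff, not the exact code).
#include <iostream>
#include <string>

#include "common/common.h"  // gpt_params
#include "llama.h"

static llama_context* load_gguf(const std::string& gguf_path) {
  gpt_params params;
  params.model = gguf_path;  // e.g. a llama-2-7b-chat.Q4_0.gguf file
  params.main_gpu = 0;       // placeholder GPU settings, as set in the handler
  params.n_gpu_layers = 35;

  llama_backend_init(params.numa);
  llama_context_params ctx_params = llama_context_default_params();
  llama_model* model =
      llama_load_model_from_file(params.model.c_str(), ctx_params);
  if (model == nullptr) {
    std::cerr << "Failed to load GGUF model: " << gguf_path << std::endl;
    return nullptr;
  }
  llama_context* ctx = llama_new_context_with_model(model, ctx_params);
  if (ctx == nullptr) {
    std::cerr << "Failed to initialize llama context" << std::endl;
  }
  return ctx;
}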
@@ -79,7 +51,7 @@ LlmHandler::LoadModel(
}
}

std::vector<torch::jit::IValue> LlmHandler::Preprocess(
std::vector<torch::jit::IValue> LlamacppHandler::Preprocess(
std::shared_ptr<torch::Device>& device,
std::pair<std::string&, std::map<uint8_t, std::string>&>& idx_to_req_id,
std::shared_ptr<torchserve::InferenceRequestBatch>& request_batch,
@@ -133,7 +105,6 @@ std::vector<torch::jit::IValue> LlmHandler::Preprocess(
tokens_list = ::llama_tokenize(llama_ctx, msg, true);

// const int max_context_size = llama_n_ctx(ctx);
const int max_context_size = 64;
const int max_tokens_list_size = max_context_size - 4;

if ((int)tokens_list.size() > max_tokens_list_size) {
@@ -173,7 +144,7 @@ std::vector<torch::jit::IValue> LlmHandler::Preprocess(
return batch_ivalue;
}

torch::Tensor LlmHandler::Inference(
torch::Tensor LlamacppHandler::Inference(
std::shared_ptr<torch::jit::script::Module> model,
std::vector<torch::jit::IValue>& inputs,
std::shared_ptr<torch::Device>& device,
@@ -197,19 +168,22 @@ torch::Tensor LlmHandler::Inference(
for (auto id : long_vector) {
tokens_list.push_back(id);
}
const int n_gen = std::min(32, max_context_size);

// gpt_params params;

const int max_context_size = 64;
while (llama_get_kv_cache_token_count(llama_ctx) < n_gen) {
// evaluate the transformer

while (llama_get_kv_cache_token_count(llama_ctx) < max_context_size) {
if (llama_eval(llama_ctx, tokens_list.data(), int(tokens_list.size()),
llama_get_kv_cache_token_count(llama_ctx),
params.n_threads)) {
std::cout << "Evaluation Failed" << __func__ << std::endl;
// TODO: Raise exception here
std::cout << "Failed to eval\n" << __func__ << std::endl;
break;
}

tokens_list.clear();

// sample the next token

llama_token new_token_id = 0;

auto logits = llama_get_logits(llama_ctx);
@@ -228,13 +202,17 @@ torch::Tensor LlmHandler::Inference(

new_token_id = llama_sample_token_greedy(llama_ctx, &candidates_p);

if (new_token_id == llama_token_eos()) {
// is it an end of stream ?
if (new_token_id == llama_token_eos(llama_ctx)) {
std::cout << "Reached [end of text]\n";
break;
}

std::cout << "New Token: " << llama_token_to_str(llama_ctx, new_token_id);
// print the new token :
std::cout << "New Token: " << llama_token_to_piece(llama_ctx, new_token_id)
<< std::endl;

// Push this new token for next evaluation :
// push this new token for next evaluation
tokens_list.push_back(new_token_id);
}
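The candidate construction between llama_get_logits and llama_sample_token_greedy is collapsed in this view; as a reference point only, the greedy-sampling step in llama.cpp examples of this vintage looks roughly like the sketch below (an illustration, not the collapsed lines themselves, with signatures per the pinned llama.cpp revision).

// Illustrative greedy-sampling step, as written in llama.cpp examples of this
// era; a reference sketch rather than the collapsed lines from the diff.
#include <vector>

#include "llama.h"

static llama_token sample_greedy(llama_context* ctx) {
  float* logits = llama_get_logits(ctx);
  const int n_vocab = llama_n_vocab(ctx);

  std::vector<llama_token_data> candidates;
  candidates.reserve(n_vocab);
  for (llama_token token_id = 0; token_id < n_vocab; ++token_id) {
    candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
  }
  llama_token_data_array candidates_p = {candidates.data(), candidates.size(),
                                         false};
  return llama_sample_token_greedy(ctx, &candidates_p);
}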

@@ -245,12 +223,12 @@
}

torch::Tensor stacked_tensor = torch::stack(tensor_vector);

llama_print_timings(llama_ctx);
llama_free(llama_ctx);
return stacked_tensor;
}

void LlmHandler::Postprocess(
void LlamacppHandler::Postprocess(
const torch::Tensor& data,
std::pair<std::string&, std::map<uint8_t, std::string>&>& idx_to_req_id,
std::shared_ptr<torchserve::InferenceResponseBatch>& response_batch) {
@@ -263,7 +241,7 @@ void LlmHandler::Postprocess(

auto data_ptr = data.data_ptr<int64_t>();
for (int64_t i = 0; i < num_elements; ++i) {
generated_text_stream << llama_token_to_str(llama_ctx, data_ptr[i]);
generated_text_stream << llama_token_to_piece(llama_ctx, data_ptr[i]);
}

std::string generated_text_str = generated_text_stream.str();
@@ -297,13 +275,13 @@ void LlmHandler::Postprocess(

#if defined(__linux__) || defined(__APPLE__)
extern "C" {
torchserve::torchscripted::BaseHandler* allocatorLlmHandler() {
return new llm::LlmHandler();
torchserve::torchscripted::BaseHandler* allocatorLlamacppHandler() {
return new llm::LlamacppHandler();
}

void deleterLlmHandler(torchserve::torchscripted::BaseHandler* p) {
void deleterLlamacppHandler(torchserve::torchscripted::BaseHandler* p) {
if (p != nullptr) {
delete static_cast<llm::LlmHandler*>(p);
delete static_cast<llm::LlamacppHandler*>(p);
}
}
}
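These extern "C" allocator/deleter symbols are what the backend looks up in libllamacpp_handler.so at runtime, which is why the shared-library name in build.sh, the CMake target, and the symbol names all change together. Below is a generic illustration of that lookup using POSIX dlopen/dlsym; it is not TorchServe's actual loader code.

// Generic illustration of resolving the renamed factory symbols with POSIX
// dlopen/dlsym; TorchServe's real loader differs, this only shows why the
// library name and exported symbol names must stay in sync.
#include <dlfcn.h>

#include <iostream>

#include "src/backends/torch_scripted/handler/base_handler.hh"

using AllocatorFn = torchserve::torchscripted::BaseHandler* (*)();
using DeleterFn = void (*)(torchserve::torchscripted::BaseHandler*);

int main() {
  void* lib = dlopen("libllamacpp_handler.so", RTLD_LAZY);
  if (lib == nullptr) {
    std::cerr << "dlopen failed: " << dlerror() << std::endl;
    return 1;
  }
  auto alloc =
      reinterpret_cast<AllocatorFn>(dlsym(lib, "allocatorLlamacppHandler"));
  auto release =
      reinterpret_cast<DeleterFn>(dlsym(lib, "deleterLlamacppHandler"));
  if (alloc != nullptr && release != nullptr) {
    torchserve::torchscripted::BaseHandler* handler = alloc();
    release(handler);  // destroys the llm::LlamacppHandler through the C ABI
  }
  dlclose(lib);
  return 0;
}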
cpp/src/examples/llamacpp/llamacpp_handler.hh
@@ -1,23 +1,25 @@
#ifndef LLM_HANDLER_HH_
#define LLM_HANDLER_HH_

#include "examples/common.h"
#include "common/common.h"
#include "ggml.h"
#include "llama.h"
#include "src/backends/torch_scripted/handler/base_handler.hh"

namespace llm {
class LlmHandler : public torchserve::torchscripted::BaseHandler {
class LlamacppHandler : public torchserve::torchscripted::BaseHandler {
private:
gpt_params params;
llama_model* llamamodel;
llama_context_params ctx_params;
llama_context* llama_ctx;
const int max_context_size = 32;

public:
// NOLINTBEGIN(bugprone-exception-escape)
LlmHandler() = default;
LlamacppHandler() = default;
// NOLINTEND(bugprone-exception-escape)
~LlmHandler() override = default;
~LlamacppHandler() override = default;

void initialize_context();

13 changes: 7 additions & 6 deletions cpp/test/backends/torch_scripted/torch_scripted_backend_test.cc
@@ -79,12 +79,13 @@ TEST_F(TorchScriptedBackendTest, TestLoadPredictMnistHandler) {
}

TEST_F(TorchScriptedBackendTest, TestLoadPredictLlmHandler) {
this->LoadPredict(std::make_shared<torchserve::LoadModelRequest>(
"test/resources/torchscript_model/llm/llm_handler",
"llm", -1, "", "", 1, false),
"test/resources/torchscript_model/llm/llm_handler",
"test/resources/torchscript_model/llm/llm_handler/prompt.txt",
"llm_ts", 200);
this->LoadPredict(
std::make_shared<torchserve::LoadModelRequest>(
"test/resources/torchscript_model/llamacpp/llamacpp_handler", "llm",
-1, "", "", 1, false),
"test/resources/torchscript_model/llamacpp/llamacpp_handler",
"test/resources/torchscript_model/llamacpp/sentences.json", "llm_ts",
200);
}

TEST_F(TorchScriptedBackendTest, TestBackendInitWrongModelDir) {
