BabyLlama with CPP backend #2544
@@ -0,0 +1,297 @@
#include "src/examples/babyllama/baby_llama_handler.hh"

#include <typeinfo>

#include "src/examples/babyllama/llama2.c/run.c"

namespace llm {

// llama2.c state shared by all requests handled by this backend worker
Transformer transformer;
Tokenizer tokenizer;
Sampler sampler;
int steps = 256;

std::pair<std::shared_ptr<torch::jit::script::Module>,
          std::shared_ptr<torch::Device>>
LlmHandler::LoadModel(
    std::shared_ptr<torchserve::LoadModelRequest>& load_model_request) {
  try {
    auto device = GetTorchDevice(load_model_request);
    // Load the (dummy) TorchScript module; the real weights come from the
    // llama2.c checkpoint loaded below
    auto module = std::make_shared<torch::jit::script::Module>(
        torch::jit::load(fmt::format("{}/{}", load_model_request->model_dir,
                                     manifest_->GetModel().serialized_file),
                         *device));

    const std::string configFilePath =
        fmt::format("{}/{}", load_model_request->model_dir, "config.json");
    std::string jsonContent;
    if (!folly::readFile(configFilePath.c_str(), jsonContent)) {
      std::cerr << "config.json not found at: " << configFilePath << std::endl;
      throw std::runtime_error("config.json not found at: " + configFilePath);
    }
    folly::dynamic json;
    json = folly::parseJson(jsonContent);
    std::string checkpoint_path;
    std::string tokenizer_path;
    if (json.find("checkpoint_path") != json.items().end() &&
        json.find("tokenizer_path") != json.items().end()) {
      checkpoint_path = json["checkpoint_path"].asString();
      tokenizer_path = json["tokenizer_path"].asString();
    } else {
      std::cerr << "Required fields 'checkpoint_path' and 'tokenizer_path' "
                   "not found in JSON."
                << std::endl;
      throw std::runtime_error(
          "Missing 'checkpoint_path' or 'tokenizer_path' in config.json");
    }

    build_transformer(&transformer,
                      const_cast<char*>(checkpoint_path.c_str()));

    build_tokenizer(&tokenizer, const_cast<char*>(tokenizer_path.c_str()),
                    transformer.config.vocab_size);

    float temperature =
        1.0f;  // 0.0 = greedy deterministic. 1.0 = original. don't set higher
    float topp = 0.9f;  // top-p in nucleus sampling. 1.0 = off. 0.9 works
                        // well, but slower
    // seed the sampler's RNG; time(nullptr) keeps runs from repeating
    unsigned long long rng_seed =
        static_cast<unsigned long long>(time(nullptr));
    // build the Sampler
    build_sampler(&sampler, transformer.config.vocab_size, temperature, topp,
                  rng_seed);

    return std::make_pair(module, device);
  } catch (const c10::Error& e) {
    TS_LOGF(ERROR, "loading the model: {}, device id: {}, error: {}",
            load_model_request->model_name, load_model_request->gpu_id,
            e.msg());
    throw e;
  } catch (const std::runtime_error& e) {
    TS_LOGF(ERROR, "loading the model: {}, device id: {}, error: {}",
            load_model_request->model_name, load_model_request->gpu_id,
            e.what());
    throw e;
  }
}
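LoadModel above expects a config.json inside model_dir that supplies the two paths it reads. A minimal example of that file might look like the following; the concrete file paths are illustrative assumptions, not taken from the PR:

{
  "checkpoint_path": "/path/to/stories15M.bin",
  "tokenizer_path": "/path/to/tokenizer.bin"
}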

std::vector<torch::jit::IValue> LlmHandler::Preprocess(
    std::shared_ptr<torch::Device>& device,
    std::pair<std::string&, std::map<uint8_t, std::string>&>& idx_to_req_id,
    std::shared_ptr<torchserve::InferenceRequestBatch>& request_batch,
    std::shared_ptr<torchserve::InferenceResponseBatch>& response_batch) {
  std::vector<torch::jit::IValue> batch_ivalue;
  std::vector<torch::Tensor> batch_tensors;
  uint8_t idx = 0;
  for (auto& request : *request_batch) {
    try {
      (*response_batch)[request.request_id] =
          std::make_shared<torchserve::InferenceResponse>(request.request_id);
      idx_to_req_id.first += idx_to_req_id.first.empty()
                                 ? request.request_id
                                 : "," + request.request_id;

      auto data_it = request.parameters.find(
          torchserve::PayloadType::kPARAMETER_NAME_DATA);
      auto dtype_it = request.headers.find(
          torchserve::PayloadType::kHEADER_NAME_DATA_TYPE);
      if (data_it == request.parameters.end()) {
        data_it = request.parameters.find(
            torchserve::PayloadType::kPARAMETER_NAME_BODY);
        dtype_it = request.headers.find(
            torchserve::PayloadType::kHEADER_NAME_BODY_TYPE);
      }

      if (data_it == request.parameters.end() ||
          dtype_it == request.headers.end()) {
        TS_LOGF(ERROR, "Empty payload for request id: {}", request.request_id);
        (*response_batch)[request.request_id]->SetResponse(
            500, "data_type", torchserve::PayloadType::kCONTENT_TYPE_TEXT,
            "Empty payload");
        continue;
      }

      std::string msg = torchserve::Converter::VectorToStr(data_it->second);

      int num_prompt_tokens = 0;

      std::unique_ptr<char[], void (*)(char*)> msgCStr(
          new char[msg.size() + 1], [](char* ptr) { delete[] ptr; });

      std::strcpy(msgCStr.get(), msg.c_str());

      // +3 matches run.c's own allocation (strlen(prompt)+3) so encode() has
      // room for the optional BOS/EOS tokens beyond the raw prompt length
      std::unique_ptr<int[]> prompt_tokens(new int[msg.length() + 3]);

      encode(&tokenizer, msgCStr.get(), 1, 0, prompt_tokens.get(),
             &num_prompt_tokens);

      std::vector<torch::Tensor> tensor_vector;

      for (int64_t i = 0; i < num_prompt_tokens; ++i) {
        int token = prompt_tokens[i];
        torch::Tensor tensor = torch::tensor(token, torch::kInt64);
        tensor_vector.push_back(tensor);
      }
      torch::Tensor stacked_tensor = torch::stack(tensor_vector);
      batch_ivalue.push_back(stacked_tensor);

      idx_to_req_id.second[idx++] = request.request_id;

    } catch (const std::runtime_error& e) {
      TS_LOGF(ERROR, "Failed to load tensor for request id: {}, error: {}",
              request.request_id, e.what());
      auto response = (*response_batch)[request.request_id];
      response->SetResponse(500, "data_type",
                            torchserve::PayloadType::kDATA_TYPE_STRING,
                            "runtime_error, failed to load tensor");
    } catch (const c10::Error& e) {
      TS_LOGF(ERROR, "Failed to load tensor for request id: {}, c10 error: {}",
              request.request_id, e.msg());
      auto response = (*response_batch)[request.request_id];
      response->SetResponse(500, "data_type",
                            torchserve::PayloadType::kDATA_TYPE_STRING,
                            "c10 error, failed to load tensor");
    }
  }

  return batch_ivalue;
}

Review thread attached to the Inference function below:
Reviewer: "Could you add ..."
Reviewer: "Is it ..."
Author: "torch::InferenceMode is a high-level API, c10::InferenceMode is a low-level API. According to the libtorch doc, they are trying to use torch::xxx to unify the low-level APIs."
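To make the guard discussed in that thread concrete, here is a small standalone sketch (not part of the PR) of what c10::InferenceMode gives the Inference function below: work done inside the scope skips autograd bookkeeping. The sketch keeps the c10:: spelling used by the handler; whether a torch::-level alias is available depends on the LibTorch version, as the thread notes.

#include <torch/torch.h>

#include <iostream>

int main() {
  torch::Tensor w = torch::ones({2, 2}, torch::requires_grad());
  {
    c10::InferenceMode guard;  // same RAII guard the handler uses
    torch::Tensor y = w * 2;   // no autograd graph is recorded here
    std::cout << "requires_grad inside guard: " << y.requires_grad()
              << std::endl;  // prints 0
  }
  torch::Tensor z = w * 2;  // outside the guard, autograd tracking is back on
  std::cout << "requires_grad outside guard: " << z.requires_grad()
            << std::endl;  // prints 1
  return 0;
}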
torch::Tensor LlmHandler::Inference(
    std::shared_ptr<torch::jit::script::Module> model,
    std::vector<torch::jit::IValue>& inputs,
    std::shared_ptr<torch::Device>& device,
    std::pair<std::string&, std::map<uint8_t, std::string>&>& idx_to_req_id,
    std::shared_ptr<torchserve::InferenceResponseBatch>& response_batch) {
  c10::InferenceMode guard;
  std::vector<torch::Tensor> batch_output_vector;
  for (const torch::jit::IValue& input : inputs) {

Review comment on this loop: "This for loop predicts each inference request one by one. Can we optimize this section to either leverage C++ multithreading or GPU batching power?" (A hedged sketch follows the function body below.)

    std::vector<torch::Tensor> tensor_vector;
    tensor_vector.reserve(steps);
    torch::Tensor tokens_list_tensor = input.toTensor();

    int64_t num_elements = tokens_list_tensor.numel();

    int64_t* data_ptr = tokens_list_tensor.data_ptr<int64_t>();

    std::unique_ptr<int[]> prompt_tokens(new int[num_elements]);

    for (int64_t i = 0; i < num_elements; ++i) {
      prompt_tokens[i] = data_ptr[i];
    }

    // start the main loop
    long start =
        0;     // used to time our code, only initialized after first iteration
    int next;  // will store the next token in the sequence
    int token =
        prompt_tokens[0];  // kick off with the first token in the prompt
    int pos = 0;           // position in the sequence
    while (pos < steps) {
      // forward the transformer to get logits for the next token
      float* logits = forward(&transformer, token, pos);

      // advance the state machine
      if (pos < num_elements - 1) {
        // if we are still processing the input prompt, force the next prompt
        // token
        next = prompt_tokens[pos + 1];
      } else {
        // otherwise sample the next token from the logits
        next = sample(&sampler, logits);
      }
      pos++;

      torch::Tensor tensor = torch::tensor(next, torch::kLong);
      tensor_vector.push_back(tensor);

      // data-dependent terminating condition: the BOS (=1) token delimits
      // sequences
      if (next == 1) {
        break;
      }
      token = next;

      // init the timer here because the first iteration can be slower
      if (start == 0) {
        start = time_in_ms();
      }
    }

    // report achieved tok/s (pos-1 because the timer starts after first
    // iteration)
    if (pos > 1) {
      long end = time_in_ms();
      double token_per_sec = (pos - 1) / (double)(end - start) * 1000;
      std::cout << "Achieved tok per sec: " << token_per_sec << std::endl;
    }

    torch::Tensor stacked_tensor = torch::stack(tensor_vector);

    batch_output_vector.push_back(stacked_tensor);
  }

  return torch::stack(batch_output_vector);
}
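On the reviewer's question about the per-request loop: a first step could be to fan the independent requests out across threads, as sketched below. This is only a sketch under two explicit assumptions: generate_one is a hypothetical wrapper around the body of the loop above, and each task would need its own Transformer/Sampler state, because the globals this handler shares (transformer, sampler) are mutated by forward()/sample() and are not thread-safe as written. True GPU batching would instead require padding the prompts into a single batched forward pass, which is a larger change.

#include <torch/torch.h>

#include <functional>
#include <future>
#include <vector>

// Fan out one generation task per request and collect results in request
// order. generate_one is assumed to run the single-request token loop and
// return the stacked output tensor for that request.
std::vector<torch::Tensor> GenerateBatchParallel(
    const std::vector<torch::Tensor>& prompts,
    const std::function<torch::Tensor(const torch::Tensor&)>& generate_one) {
  std::vector<std::future<torch::Tensor>> futures;
  futures.reserve(prompts.size());
  for (const auto& prompt : prompts) {
    // each request runs on its own thread
    futures.push_back(
        std::async(std::launch::async, generate_one, std::cref(prompt)));
  }
  std::vector<torch::Tensor> outputs;
  outputs.reserve(futures.size());
  for (auto& fut : futures) {
    outputs.push_back(fut.get());  // preserves the original request order
  }
  return outputs;
}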

void LlmHandler::Postprocess(
    const torch::Tensor& data,
    std::pair<std::string&, std::map<uint8_t, std::string>&>& idx_to_req_id,
    std::shared_ptr<torchserve::InferenceResponseBatch>& response_batch) {
  for (const auto& kv : idx_to_req_id.second) {
    try {
      int64_t num_elements = data.numel();

      int64_t* data_ptr = data.data_ptr<int64_t>();
      int64_t token = 1;
      std::string concatenated_string;
      for (int64_t i = 0; i < num_elements; ++i) {
        char* piece = decode(&tokenizer, token, data_ptr[i]);
        std::string piece_string(piece);
        token = data_ptr[i];
        concatenated_string += piece_string;
      }

      std::cout << "Generated String: " << concatenated_string << std::endl;

      auto response = (*response_batch)[kv.second];

      response->SetResponse(200, "data_type",
                            torchserve::PayloadType::kDATA_TYPE_STRING,
                            concatenated_string);
    } catch (const std::runtime_error& e) {
      TS_LOGF(ERROR, "Failed to load tensor for request id: {}, error: {}",
              kv.second, e.what());
      auto response = (*response_batch)[kv.second];
      response->SetResponse(500, "data_type",
                            torchserve::PayloadType::kDATA_TYPE_STRING,
                            "runtime_error, failed to postprocess tensor");
    } catch (const c10::Error& e) {
      TS_LOGF(ERROR,
              "Failed to postprocess tensor for request id: {}, error: {}",
              kv.second, e.msg());
      auto response = (*response_batch)[kv.second];
      response->SetResponse(500, "data_type",
                            torchserve::PayloadType::kDATA_TYPE_STRING,
                            "c10 error, failed to postprocess tensor");
    }
  }
}

LlmHandler::~LlmHandler() noexcept {
  free_sampler(&sampler);
  free_tokenizer(&tokenizer);
  free_transformer(&transformer);
}

}  // namespace llm

#if defined(__linux__) || defined(__APPLE__)
extern "C" {
torchserve::torchscripted::BaseHandler* allocatorLlmHandler() {
  return new llm::LlmHandler();
}

void deleterLlmHandler(torchserve::torchscripted::BaseHandler* p) {
  if (p != nullptr) {
    delete static_cast<llm::LlmHandler*>(p);
  }
}
}
#endif
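The extern "C" allocator/deleter pair above is what lets the handler be compiled into a shared library and constructed by name at runtime. Purely as an illustration (the library path and loader code below are assumptions for the sketch, not the actual TorchServe backend code), a dlopen-based loader would resolve the two symbols roughly like this:

#include <dlfcn.h>

#include <iostream>

#include "src/backends/torch_scripted/handler/base_handler.hh"

int main() {
  void* lib = dlopen("./libbabyllama_handler.so", RTLD_NOW);  // assumed name
  if (lib == nullptr) {
    std::cerr << dlerror() << std::endl;
    return 1;
  }

  using Allocator = torchserve::torchscripted::BaseHandler* (*)();
  using Deleter = void (*)(torchserve::torchscripted::BaseHandler*);

  auto allocate =
      reinterpret_cast<Allocator>(dlsym(lib, "allocatorLlmHandler"));
  auto destroy = reinterpret_cast<Deleter>(dlsym(lib, "deleterLlmHandler"));
  if (allocate != nullptr && destroy != nullptr) {
    torchserve::torchscripted::BaseHandler* handler = allocate();
    // ... hand the handler to the serving loop here ...
    destroy(handler);
  }

  dlclose(lib);
  return 0;
}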
@@ -0,0 +1,47 @@
#ifndef LLM_HANDLER_HH_
#define LLM_HANDLER_HH_

#include <folly/FileUtil.h>
#include <folly/json.h>

#include <iostream>

#include "src/backends/torch_scripted/handler/base_handler.hh"

namespace llm {
class LlmHandler : public torchserve::torchscripted::BaseHandler {
 public:
  // NOLINTBEGIN(bugprone-exception-escape)
  LlmHandler() = default;
  // NOLINTEND(bugprone-exception-escape)
  ~LlmHandler() noexcept;

  void initialize_context();

  virtual std::pair<std::shared_ptr<torch::jit::script::Module>,
                    std::shared_ptr<torch::Device>>
  LoadModel(std::shared_ptr<torchserve::LoadModelRequest>& load_model_request);

  std::vector<torch::jit::IValue> Preprocess(
      std::shared_ptr<torch::Device>& device,
      std::pair<std::string&, std::map<uint8_t, std::string>&>& idx_to_req_id,
      std::shared_ptr<torchserve::InferenceRequestBatch>& request_batch,
      std::shared_ptr<torchserve::InferenceResponseBatch>& response_batch)
      override;

  torch::Tensor Inference(
      std::shared_ptr<torch::jit::script::Module> model,
      std::vector<torch::jit::IValue>& inputs,
      std::shared_ptr<torch::Device>& device,
      std::pair<std::string&, std::map<uint8_t, std::string>&>& idx_to_req_id,
      std::shared_ptr<torchserve::InferenceResponseBatch>& response_batch)
      override;

  void Postprocess(
      const torch::Tensor& data,
      std::pair<std::string&, std::map<uint8_t, std::string>&>& idx_to_req_id,
      std::shared_ptr<torchserve::InferenceResponseBatch>& response_batch)
      override;
};
}  // namespace llm
#endif  // LLM_HANDLER_HH_
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2023 Andrej

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
Review comment: The current cpp backend only supports one device id, which means there is no partitioning across GPU devices; I assume this example only works on a single GPU.