From 5ff955eeffe99ab852836791dcdf072d305252d0 Mon Sep 17 00:00:00 2001
From: mzegla
Date: Mon, 28 Oct 2024 10:33:48 +0100
Subject: [PATCH] init

---
 ci/lib_search.py                      |  1 -
 external/BUILD                        |  1 -
 external/cb.patch                     | 18 ------------------
 spelling-whitelist.txt                |  1 -
 src/llm/apis/openai_completions.cpp   | 11 +++++++++++
 src/llm/apis/openai_completions.hpp   |  7 ++++++-
 src/llm/http_llm_calculator.cc        |  2 ++
 src/llm/llm_calculator.proto          |  6 ++++++
 src/llm/llmnoderesources.cpp          |  8 +++++++-
 src/llm/llmnoderesources.hpp          |  1 +
 third_party/llm_engine/llm_engine.bzl |  2 --
 11 files changed, 33 insertions(+), 25 deletions(-)
 delete mode 100644 external/cb.patch

diff --git a/ci/lib_search.py b/ci/lib_search.py
index 05b1ca905a..27578e5028 100644
--- a/ci/lib_search.py
+++ b/ci/lib_search.py
@@ -83,7 +83,6 @@ def check_dir(start_dir):
         '__pycache__',
         'add.xml',
         'azure_sdk.patch',
-        'cb.patch',
         'bazel-',
         'check_coverage.bat',
         'genhtml',
diff --git a/external/BUILD b/external/BUILD
index c6ad55a0d8..0f9b9c8882 100644
--- a/external/BUILD
+++ b/external/BUILD
@@ -47,5 +47,4 @@ exports_files([
     "listen.patch",
     "tf.patch",
     "net_http.patch",
-    "cb.patch",
 ])
\ No newline at end of file
diff --git a/external/cb.patch b/external/cb.patch
deleted file mode 100644
index 110765cc09..0000000000
--- a/external/cb.patch
+++ /dev/null
@@ -1,18 +0,0 @@
-diff --git a/CMakeLists.txt b/CMakeLists.txt
-index b08debb..4171092 100644
---- a/CMakeLists.txt
-+++ b/CMakeLists.txt
-@@ -62,9 +62,9 @@ endif()
-
- add_subdirectory(thirdparty)
- add_subdirectory(src)
--add_subdirectory(samples)
--add_subdirectory(tools/continuous_batching)
--add_subdirectory(tests/cpp)
-+#add_subdirectory(samples)
-+#add_subdirectory(tools/continuous_batching)
-+#add_subdirectory(tests/cpp)
-
- install(FILES LICENSE DESTINATION docs/licensing COMPONENT licensing_genai RENAME LICENSE-GENAI)
- install(FILES third-party-programs.txt DESTINATION docs/licensing COMPONENT licensing_genai RENAME third-party-programs-genai.txt)
-
diff --git a/spelling-whitelist.txt b/spelling-whitelist.txt
index 64601be7cc..7b9d63b474 100644
--- a/spelling-whitelist.txt
+++ b/spelling-whitelist.txt
@@ -1,7 +1,6 @@
 client/common/resnet_labels.txt
 demos/common/python/classes.py
 demos/image_classification/go/labels.go
-external/cb.patch
 extras/nginx-mtls-auth/model_server.conf.template
 release_files/thirdparty-licenses/boringssl.LICENSE.txt
 src/shape.cpp:436: strIn
diff --git a/src/llm/apis/openai_completions.cpp b/src/llm/apis/openai_completions.cpp
index 0dfc11c749..e2df56be47 100644
--- a/src/llm/apis/openai_completions.cpp
+++ b/src/llm/apis/openai_completions.cpp
@@ -348,6 +348,17 @@ absl::Status OpenAIChatCompletionsHandler::parseCommonPart(uint32_t maxTokensLim
         request.numReturnSequences = it->value.GetUint();
     }
 
+    // Speculative decoding specific parameters
+
+    // num_assistant_tokens: uint; optional - defaults to 0
+    it = doc.FindMember("num_assistant_tokens");
+    if (it != doc.MemberEnd()) {
+        if (!it->value.IsUint()) {
+            return absl::InvalidArgumentError("num_assistant_tokens must be an unsigned integer");
+        }
+        request.numAssistantTokens = it->value.GetUint();
+    }
+
     // use_beam_search: bool; optional - defaults to false
     // Extension from vLLM, unsupported by OpenAI API, not available directly in CB lib
     // Use best_of>1 to steer into beams search
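With the request parsing above in place, a chat completions call opts into speculative decoding by adding the new field next to the standard OpenAI parameters. A hypothetical request body could look like the following (model name and values are placeholders, not taken from this patch); num_assistant_tokens controls how many candidate tokens the draft model proposes per step:

    {
        "model": "meta-llama/Llama-2-7b-chat-hf",
        "messages": [
            {"role": "user", "content": "What is OpenVINO?"}
        ],
        "max_tokens": 128,
        "num_assistant_tokens": 5
    }
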
diff --git a/src/llm/apis/openai_completions.hpp b/src/llm/apis/openai_completions.hpp
index aba844303a..a33dee0b69 100644
--- a/src/llm/apis/openai_completions.hpp
+++ b/src/llm/apis/openai_completions.hpp
@@ -66,6 +66,7 @@ struct OpenAIChatCompletionsRequest {
     StreamOptions streamOptions;
     std::string model;
     std::optional<uint32_t> maxTokens{std::nullopt};
+    std::optional<uint32_t> numAssistantTokens{std::nullopt};
     std::optional<float> frequencyPenalty{std::nullopt};
     std::optional<float> presencePenalty{std::nullopt};
     std::optional<float> diversityPenalty{std::nullopt};
@@ -120,7 +121,7 @@ struct OpenAIChatCompletionsRequest {
         // TODO: early_finish = ?
         // TODO use_beam_search is unused ?
 
-        // Multinomial specific
+        // Multinomial sampling specific
         if (temperature.has_value())
             config.temperature = temperature.value();
         if (topK.has_value())
@@ -139,6 +140,10 @@ struct OpenAIChatCompletionsRequest {
             config.presence_penalty = presencePenalty.value();
         config.do_sample = config.temperature > 0.0f && config.num_beams == 1;
 
+        // Speculative decoding specific
+        if (numAssistantTokens.has_value())
+            config.num_assistant_tokens = numAssistantTokens.value();
+
         return config;
     }
 };
diff --git a/src/llm/http_llm_calculator.cc b/src/llm/http_llm_calculator.cc
index 337ec2b0c4..584b7ec6c3 100644
--- a/src/llm/http_llm_calculator.cc
+++ b/src/llm/http_llm_calculator.cc
@@ -208,7 +208,9 @@ class HttpLLMCalculator : public CalculatorBase {
             if (this->generationHandle->get_status() == ov::genai::GenerationStatus::RUNNING || this->generationHandle->can_read()) {
                 // Subsequent iteration
                 OVMS_PROFILE_SCOPE("Generation of subsequent streaming response");
+                //SPDLOG_LOGGER_INFO(llm_calculator_logger, "Start read() ...");
                 ov::genai::GenerationOutputs generationOutputs = this->generationHandle->read();
+                //SPDLOG_LOGGER_INFO(llm_calculator_logger, "End read() ...");
                 RET_CHECK(generationOutputs.size() == 1);  // TODO: Support multiple generations
                 this->apiHandler->incrementProcessedTokens(generationOutputs.begin()->second.generated_ids.size());
diff --git a/src/llm/llm_calculator.proto b/src/llm/llm_calculator.proto
index 0a464a348c..d2c88c46aa 100644
--- a/src/llm/llm_calculator.proto
+++ b/src/llm/llm_calculator.proto
@@ -47,4 +47,10 @@ message LLMCalculatorOptions {
     optional uint32 max_tokens_limit = 11 [default = 4096];
 
     optional bool enable_prefix_caching = 12 [default = false];
+
+    // speculative decoding enablement
+
+    optional string draft_models_path = 13;
+
+    optional string draft_models_device = 14 [default = "CPU"];
 }
\ No newline at end of file
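The two new options are meant to be set on the LLM node of the serving graph. A hypothetical node_options fragment could look roughly as below; the type URL and the models_path field are assumed from typical OVMS LLM graph configs rather than taken from this patch, and the paths are placeholders. As the llmnoderesources.cpp change further down shows, leaving draft_models_path unset keeps the draft model out of the plugin config entirely:

    node_options: {
        [type.googleapis.com / mediapipe.LLMCalculatorOptions]: {
            models_path: "./main_model"
            draft_models_path: "./draft_model"
            draft_models_device: "CPU"
        }
    }
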
diff --git a/src/llm/llmnoderesources.cpp b/src/llm/llmnoderesources.cpp
index b5f23e7f43..6100281455 100644
--- a/src/llm/llmnoderesources.cpp
+++ b/src/llm/llmnoderesources.cpp
@@ -156,6 +156,11 @@ Status LLMNodeResources::initializeLLMNodeResources(std::shared_ptr
     nodeResources->device = nodeOptions.device();
 
+    if (!nodeOptions.draft_models_path().empty()) {
+        auto draftModelConfig = ov::genai::draft_model(nodeOptions.draft_models_path(), nodeOptions.draft_models_device());
+        nodeResources->pluginConfig.insert(draftModelConfig);
+    }
+
     auto status = JsonParser::parsePluginConfig(nodeOptions.plugin_config(), nodeResources->pluginConfig);
     if (!status.ok()) {
         SPDLOG_ERROR("Error during llm node plugin_config option parsing to JSON: {}", nodeOptions.plugin_config());
@@ -164,7 +169,8 @@ Status LLMNodeResources::initializeLLMNodeResources(std::shared_ptr
-        nodeResources->initializeContinuousBatchingPipeline(basePath, nodeResources->schedulerConfig, nodeResources->device, nodeResources->pluginConfig, tokenizerPluginConfig);
+        nodeResources->initializeContinuousBatchingPipeline(basePath, nodeResources->schedulerConfig, nodeResources->device,
+            nodeResources->pluginConfig, tokenizerPluginConfig);
     } catch (const std::exception& e) {
         SPDLOG_ERROR("Error during llm node initialization for models_path: {} exception: {}", basePath, e.what());
         return StatusCode::LLM_NODE_RESOURCE_STATE_INITIALIZATION_FAILED;
diff --git a/src/llm/llmnoderesources.hpp b/src/llm/llmnoderesources.hpp
index 0b2453bee6..daf39c6d25 100644
--- a/src/llm/llmnoderesources.hpp
+++ b/src/llm/llmnoderesources.hpp
@@ -113,6 +113,7 @@ struct LLMNodeResources {
     int maxTokensLimit;
     int bestOfLimit;
 
+
     static Status initializeLLMNodeResources(std::shared_ptr<LLMNodeResources>& nodeResources, const ::mediapipe::CalculatorGraphConfig::Node& graphNode, std::string graphPath);
     static void loadTextProcessor(std::shared_ptr<LLMNodeResources>& nodeResources, const std::string& chatTemplateDirectory);
diff --git a/third_party/llm_engine/llm_engine.bzl b/third_party/llm_engine/llm_engine.bzl
index b444376154..26edfbc033 100644
--- a/third_party/llm_engine/llm_engine.bzl
+++ b/third_party/llm_engine/llm_engine.bzl
@@ -24,8 +24,6 @@ def llm_engine():
         build_file = "@_llm_engine//:BUILD",
         init_submodules = True,
         recursive_init_submodules = True,
-        patch_args = ["-p1"],
-        patches = ["cb.patch"],
     )
     # when using local repository manually run: git submodule update --recursive
     #native.new_local_repository(
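Taken together, the changes route num_assistant_tokens from the request and draft_models_path / draft_models_device from the graph options into ov::genai::draft_model(), which is what OpenVINO GenAI uses to drive speculative decoding. For reference, the same GenAI mechanism can be exercised stand-alone roughly as in the sketch below. This is only an illustration against the public LLMPipeline API (OVMS itself goes through the continuous batching pipeline), and the model paths, device names and token count are placeholders:

    #include <iostream>
    #include <string>

    #include "openvino/genai/llm_pipeline.hpp"

    int main() {
        // Placeholder paths: a large main model and a small draft model
        // that share the same tokenizer.
        std::string main_model_path = "/models/llama-2-7b";
        std::string draft_model_path = "/models/llama-2-160m-draft";

        // draft_model() yields the property that enables speculative decoding;
        // the patch inserts the same property into the node's plugin config.
        ov::genai::LLMPipeline pipe(
            main_model_path, "CPU",
            ov::genai::draft_model(draft_model_path, "CPU"));

        ov::genai::GenerationConfig config;
        config.max_new_tokens = 128;
        // Number of candidate tokens the draft model proposes per step,
        // i.e. what the new num_assistant_tokens request field maps to.
        config.num_assistant_tokens = 5;

        std::string result = pipe.generate("What is OpenVINO?", config);
        std::cout << result << std::endl;
        return 0;
    }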