From 5ff955eeffe99ab852836791dcdf072d305252d0 Mon Sep 17 00:00:00 2001
From: mzegla
Date: Mon, 28 Oct 2024 10:33:48 +0100
Subject: [PATCH] init

---
 ci/lib_search.py                      |  1 -
 external/BUILD                        |  1 -
 external/cb.patch                     | 18 ------------------
 spelling-whitelist.txt                |  1 -
 src/llm/apis/openai_completions.cpp   | 11 +++++++++++
 src/llm/apis/openai_completions.hpp   |  7 ++++++-
 src/llm/http_llm_calculator.cc        |  2 ++
 src/llm/llm_calculator.proto          |  6 ++++++
 src/llm/llmnoderesources.cpp          |  8 +++++++-
 src/llm/llmnoderesources.hpp          |  1 +
 third_party/llm_engine/llm_engine.bzl |  2 --
 11 files changed, 33 insertions(+), 25 deletions(-)
 delete mode 100644 external/cb.patch

diff --git a/ci/lib_search.py b/ci/lib_search.py
index 05b1ca905a..27578e5028 100644
--- a/ci/lib_search.py
+++ b/ci/lib_search.py
@@ -83,7 +83,6 @@ def check_dir(start_dir):
         '__pycache__',
         'add.xml',
         'azure_sdk.patch',
-        'cb.patch',
         'bazel-',
         'check_coverage.bat',
         'genhtml',
diff --git a/external/BUILD b/external/BUILD
index c6ad55a0d8..0f9b9c8882 100644
--- a/external/BUILD
+++ b/external/BUILD
@@ -47,5 +47,4 @@ exports_files([
     "listen.patch",
     "tf.patch",
     "net_http.patch",
-    "cb.patch",
 ])
\ No newline at end of file
diff --git a/external/cb.patch b/external/cb.patch
deleted file mode 100644
index 110765cc09..0000000000
--- a/external/cb.patch
+++ /dev/null
@@ -1,18 +0,0 @@
-diff --git a/CMakeLists.txt b/CMakeLists.txt
-index b08debb..4171092 100644
---- a/CMakeLists.txt
-+++ b/CMakeLists.txt
-@@ -62,9 +62,9 @@ endif()
-
- add_subdirectory(thirdparty)
- add_subdirectory(src)
--add_subdirectory(samples)
--add_subdirectory(tools/continuous_batching)
--add_subdirectory(tests/cpp)
-+#add_subdirectory(samples)
-+#add_subdirectory(tools/continuous_batching)
-+#add_subdirectory(tests/cpp)
-
- install(FILES LICENSE DESTINATION docs/licensing COMPONENT licensing_genai RENAME LICENSE-GENAI)
- install(FILES third-party-programs.txt DESTINATION docs/licensing COMPONENT licensing_genai RENAME third-party-programs-genai.txt)
-
diff --git a/spelling-whitelist.txt b/spelling-whitelist.txt
index 64601be7cc..7b9d63b474 100644
--- a/spelling-whitelist.txt
+++ b/spelling-whitelist.txt
@@ -1,7 +1,6 @@
 client/common/resnet_labels.txt
 demos/common/python/classes.py
 demos/image_classification/go/labels.go
-external/cb.patch
 extras/nginx-mtls-auth/model_server.conf.template
 release_files/thirdparty-licenses/boringssl.LICENSE.txt
 src/shape.cpp:436: strIn
diff --git a/src/llm/apis/openai_completions.cpp b/src/llm/apis/openai_completions.cpp
index 0dfc11c749..e2df56be47 100644
--- a/src/llm/apis/openai_completions.cpp
+++ b/src/llm/apis/openai_completions.cpp
@@ -348,6 +348,17 @@ absl::Status OpenAIChatCompletionsHandler::parseCommonPart(uint32_t maxTokensLim
         request.numReturnSequences = it->value.GetUint();
     }
 
+    // Speculative decoding specific parameters
+
+    // num_assistant_tokens: uint; optional - defaults to 0
+    it = doc.FindMember("num_assistant_tokens");
+    if (it != doc.MemberEnd()) {
+        if (!it->value.IsUint()) {
+            return absl::InvalidArgumentError("num_assistant_tokens must be an unsigned integer");
+        }
+        request.numAssistantTokens = it->value.GetUint();
+    }
+
     // use_beam_search: bool; optional - defaults to false
     // Extension from vLLM, unsupported by OpenAI API, not available directly in CB lib
     // Use best_of>1 to steer into beams search
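With the request parsing above in place, a chat completions call opts into speculative decoding by adding the new field next to the standard OpenAI parameters. A hypothetical request body could look like the following (model name and values are placeholders, not taken from this patch); num_assistant_tokens controls how many candidate tokens the draft model proposes per step:

    {
        "model": "meta-llama/Llama-2-7b-chat-hf",
        "messages": [
            {"role": "user", "content": "What is OpenVINO?"}
        ],
        "max_tokens": 128,
        "num_assistant_tokens": 5
    }
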
diff --git a/src/llm/apis/openai_completions.hpp b/src/llm/apis/openai_completions.hpp
index aba844303a..a33dee0b69 100644
--- a/src/llm/apis/openai_completions.hpp
+++ b/src/llm/apis/openai_completions.hpp
@@ -66,6 +66,7 @@ struct OpenAIChatCompletionsRequest {
     StreamOptions streamOptions;
     std::string model;
     std::optional<uint32_t> maxTokens{std::nullopt};
+    std::optional<uint32_t> numAssistantTokens{std::nullopt};
     std::optional<float> frequencyPenalty{std::nullopt};
     std::optional<float> presencePenalty{std::nullopt};
     std::optional<float> diversityPenalty{std::nullopt};
@@ -120,7 +121,7 @@ struct OpenAIChatCompletionsRequest {
         // TODO: early_finish = ?
         // TODO use_beam_search is unused ?
 
-        // Multinomial specific
+        // Multinomial sampling specific
         if (temperature.has_value())
             config.temperature = temperature.value();
         if (topK.has_value())
@@ -139,6 +140,10 @@ struct OpenAIChatCompletionsRequest {
             config.presence_penalty = presencePenalty.value();
         config.do_sample = config.temperature > 0.0f && config.num_beams == 1;
 
+        // Speculative decoding specific
+        if (numAssistantTokens.has_value())
+            config.num_assistant_tokens = numAssistantTokens.value();
+
         return config;
     }
 };
diff --git a/src/llm/http_llm_calculator.cc b/src/llm/http_llm_calculator.cc
index 337ec2b0c4..584b7ec6c3 100644
--- a/src/llm/http_llm_calculator.cc
+++ b/src/llm/http_llm_calculator.cc
@@ -208,7 +208,9 @@ class HttpLLMCalculator : public CalculatorBase {
             if (this->generationHandle->get_status() == ov::genai::GenerationStatus::RUNNING || this->generationHandle->can_read()) {
                 // Subsequent iteration
                 OVMS_PROFILE_SCOPE("Generation of subsequent streaming response");
+                //SPDLOG_LOGGER_INFO(llm_calculator_logger, "Start read() ...");
                 ov::genai::GenerationOutputs generationOutputs = this->generationHandle->read();
+                //SPDLOG_LOGGER_INFO(llm_calculator_logger, "End read() ...");
                 RET_CHECK(generationOutputs.size() == 1);  // TODO: Support multiple generations
                 this->apiHandler->incrementProcessedTokens(generationOutputs.begin()->second.generated_ids.size());
diff --git a/src/llm/llm_calculator.proto b/src/llm/llm_calculator.proto
index 0a464a348c..d2c88c46aa 100644
--- a/src/llm/llm_calculator.proto
+++ b/src/llm/llm_calculator.proto
@@ -47,4 +47,10 @@ message LLMCalculatorOptions {
     optional uint32 max_tokens_limit = 11 [default = 4096];
 
     optional bool enable_prefix_caching = 12 [default = false];
+
+    // speculative decoding enablement
+
+    optional string draft_models_path = 13;
+
+    optional string draft_models_device = 14 [default = "CPU"];
 }
\ No newline at end of file
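The two new options are meant to be set on the LLM node of the serving graph. A hypothetical node_options fragment could look roughly as below; the type URL and the models_path field are assumed from typical OVMS LLM graph configs rather than taken from this patch, and the paths are placeholders. As the llmnoderesources.cpp change further down shows, leaving draft_models_path unset keeps the draft model out of the plugin config entirely:

    node_options: {
        [type.googleapis.com / mediapipe.LLMCalculatorOptions]: {
            models_path: "./main_model"
            draft_models_path: "./draft_model"
            draft_models_device: "CPU"
        }
    }
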
diff --git a/src/llm/llmnoderesources.cpp b/src/llm/llmnoderesources.cpp
index b5f23e7f43..6100281455 100644
--- a/src/llm/llmnoderesources.cpp
+++ b/src/llm/llmnoderesources.cpp
@@ -156,6 +156,11 @@ Status LLMNodeResources::initializeLLMNodeResources(std::shared_ptr
     nodeResources->device = nodeOptions.device();
 
+    if (!nodeOptions.draft_models_path().empty()) {
+        auto draftModelConfig = ov::genai::draft_model(nodeOptions.draft_models_path(), nodeOptions.draft_models_device());
+        nodeResources->pluginConfig.insert(draftModelConfig);
+    }
+
     auto status = JsonParser::parsePluginConfig(nodeOptions.plugin_config(), nodeResources->pluginConfig);
     if (!status.ok()) {
         SPDLOG_ERROR("Error during llm node plugin_config option parsing to JSON: {}", nodeOptions.plugin_config());
@@ -164,7 +169,8 @@ Status LLMNodeResources::initializeLLMNodeResources(std::shared_ptr
-        nodeResources->initializeContinuousBatchingPipeline(basePath, nodeResources->schedulerConfig, nodeResources->device, nodeResources->pluginConfig, tokenizerPluginConfig);
+        nodeResources->initializeContinuousBatchingPipeline(basePath, nodeResources->schedulerConfig, nodeResources->device,
+            nodeResources->pluginConfig, tokenizerPluginConfig);
     } catch (const std::exception& e) {
         SPDLOG_ERROR("Error during llm node initialization for models_path: {} exception: {}", basePath, e.what());
         return StatusCode::LLM_NODE_RESOURCE_STATE_INITIALIZATION_FAILED;
diff --git a/src/llm/llmnoderesources.hpp b/src/llm/llmnoderesources.hpp
index 0b2453bee6..daf39c6d25 100644
--- a/src/llm/llmnoderesources.hpp
+++ b/src/llm/llmnoderesources.hpp
@@ -113,6 +113,7 @@ struct LLMNodeResources {
     int maxTokensLimit;
     int bestOfLimit;
 
+
     static Status initializeLLMNodeResources(std::shared_ptr<LLMNodeResources>& nodeResources, const ::mediapipe::CalculatorGraphConfig::Node& graphNode, std::string graphPath);
     static void loadTextProcessor(std::shared_ptr<LLMNodeResources>& nodeResources, const std::string& chatTemplateDirectory);
diff --git a/third_party/llm_engine/llm_engine.bzl b/third_party/llm_engine/llm_engine.bzl
index b444376154..26edfbc033 100644
--- a/third_party/llm_engine/llm_engine.bzl
+++ b/third_party/llm_engine/llm_engine.bzl
@@ -24,8 +24,6 @@ def llm_engine():
         build_file = "@_llm_engine//:BUILD",
         init_submodules = True,
         recursive_init_submodules = True,
-        patch_args = ["-p1"],
-        patches = ["cb.patch"],
     )
     # when using local repository manually run: git submodule update --recursive
     #native.new_local_repository(
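Taken together, the changes route num_assistant_tokens from the request and draft_models_path / draft_models_device from the graph options into ov::genai::draft_model(), which is what OpenVINO GenAI uses to drive speculative decoding. For reference, the same GenAI mechanism can be exercised stand-alone roughly as in the sketch below. This is only an illustration against the public LLMPipeline API (OVMS itself goes through the continuous batching pipeline), and the model paths, device names and token count are placeholders:

    #include <iostream>
    #include <string>

    #include "openvino/genai/llm_pipeline.hpp"

    int main() {
        // Placeholder paths: a large main model and a small draft model
        // that share the same tokenizer.
        std::string main_model_path = "/models/llama-2-7b";
        std::string draft_model_path = "/models/llama-2-160m-draft";

        // draft_model() yields the property that enables speculative decoding;
        // the patch inserts the same property into the node's plugin config.
        ov::genai::LLMPipeline pipe(
            main_model_path, "CPU",
            ov::genai::draft_model(draft_model_path, "CPU"));

        ov::genai::GenerationConfig config;
        config.max_new_tokens = 128;
        // Number of candidate tokens the draft model proposes per step,
        // i.e. what the new num_assistant_tokens request field maps to.
        config.num_assistant_tokens = 5;

        std::string result = pipe.generate("What is OpenVINO?", config);
        std::cout << result << std::endl;
        return 0;
    }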