
Commit c282bf2

Funatiq authored and dominicshanshan committed
refactor: remove batch_manager::KvCacheConfig and use executor::KvCacheConfig instead (NVIDIA#5384)
Signed-off-by: Robin Kobus <[email protected]>
1 parent cb904f4 commit c282bf2

File tree

19 files changed, +146 -231 lines changed

cpp/include/tensorrt_llm/batch_manager/kvCacheConfig.h

Lines changed: 0 additions & 107 deletions
This file was deleted.

cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h

Lines changed: 5 additions & 4 deletions
@@ -16,10 +16,11 @@
 
 #pragma once
 
-#include "tensorrt_llm/batch_manager/kvCacheConfig.h"
 #include "tensorrt_llm/batch_manager/kvCacheEventManager.h"
+#include "tensorrt_llm/batch_manager/kvCacheType.h"
 #include "tensorrt_llm/batch_manager/llmRequest.h" // TODO forward declare
 #include "tensorrt_llm/common/optionalRef.h"
+#include "tensorrt_llm/executor/executor.h"
 #include "tensorrt_llm/kernels/kvCacheIndex.h"
 #include "tensorrt_llm/runtime/bufferManager.h"
 #include "tensorrt_llm/runtime/common.h"
@@ -1309,7 +1310,7 @@ class BaseKVCacheManager
     /// @param config KV cache configuration parameters
     /// @return Tuple containing the {.freePrimaryMemBytes, .freeSecondaryMemBytes}
     [[nodiscard]] static std::tuple<uint64_t, uint64_t> calculateFreeMemBytes(
-        runtime::BufferManager const& bufferManager, KvCacheConfig const& config);
+        runtime::BufferManager const& bufferManager, executor::KvCacheConfig const& config);
 
     /// @brief Calculate the maximum number of KV cache blocks that can be allocated based on available GPU memory.
     /// @details This function computes how many blocks each WindowBlockManager should receive based on the weighted
@@ -1327,8 +1328,8 @@ class BaseKVCacheManager
     /// @param extraCostMemory Additional memory cost to account for CacheTransBufferManager::preAllocBufferSize
     /// @param kvFactor Factor for KV cache size calculation (typically 2 for key+value)
     /// @return Map from window size to tuple of (primary blocks, secondary blocks)
-    [[nodiscard]] static BlocksPerWindow calculateMaxNumBlocks(KvCacheConfig const& config, bool isCrossAttention,
-        nvinfer1::DataType dtype, tensorrt_llm::runtime::ModelConfig const& modelConfig,
+    [[nodiscard]] static BlocksPerWindow calculateMaxNumBlocks(executor::KvCacheConfig const& config,
+        bool isCrossAttention, nvinfer1::DataType dtype, tensorrt_llm::runtime::ModelConfig const& modelConfig,
         tensorrt_llm::runtime::WorldConfig const& worldConfig,
         std::map<SizeType32, std::vector<SizeType32>> const& windowSizeToLayers, uint64_t allottedPrimaryMemBytes,
         uint64_t allottedSecondaryMemBytes, size_t extraCostMemory, SizeType32 kvFactor);
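
For orientation, a minimal sketch of what a call site looks like after this change, assuming the headers above and a valid runtime::BufferManager; the helper name and config values are illustrative and not part of the commit:

#include <cstdint>
#include <tuple>

#include "tensorrt_llm/batch_manager/kvCacheManager.h"

namespace tle = tensorrt_llm::executor;
namespace kvcm = tensorrt_llm::batch_manager::kv_cache_manager;

// Hypothetical helper: derive the KV cache memory budget from the executor-facing config.
std::tuple<uint64_t, uint64_t> planKvCacheMemory(tensorrt_llm::runtime::BufferManager const& bufferManager)
{
    tle::KvCacheConfig config;  // executor::KvCacheConfig now replaces batch_manager's KvCacheConfig
    config.setUseUvm(false);    // knob introduced by this commit

    // The static helper accepts the executor config directly.
    return kvcm::BaseKVCacheManager::calculateFreeMemBytes(bufferManager, config);
}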

cpp/include/tensorrt_llm/batch_manager/kvCacheType.h

Lines changed: 29 additions & 0 deletions

@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+namespace tensorrt_llm::batch_manager::kv_cache_manager
+{
+
+enum class CacheType
+{
+    kSELF = 0,
+    kCROSS = 1,
+    kSELFKONLY = 2,
+};
+
+} // namespace tensorrt_llm::batch_manager::kv_cache_manager
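
A small sketch of how the relocated enum might be consumed; the predicate below is hypothetical and only illustrates the fully qualified name and the intent of the enumerators:

#include "tensorrt_llm/batch_manager/kvCacheType.h"

using tensorrt_llm::batch_manager::kv_cache_manager::CacheType;

// kSELFKONLY names a key-only self cache, so it is the one variant without values.
constexpr bool holdsValues(CacheType cacheType)
{
    return cacheType != CacheType::kSELFKONLY;
}

static_assert(holdsValues(CacheType::kSELF));
static_assert(holdsValues(CacheType::kCROSS));
static_assert(!holdsValues(CacheType::kSELFKONLY));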

cpp/include/tensorrt_llm/batch_manager/transformerBuffers.h

Lines changed: 1 addition & 2 deletions
@@ -17,7 +17,7 @@
 #pragma once
 
 #include "tensorrt_llm/batch_manager/common.h"
-#include "tensorrt_llm/batch_manager/kvCacheConfig.h"
+#include "tensorrt_llm/batch_manager/kvCacheType.h"
 #include "tensorrt_llm/runtime/bufferManager.h"
 #include "tensorrt_llm/runtime/iTensor.h"
 #include "tensorrt_llm/runtime/modelConfig.h"
@@ -43,7 +43,6 @@ class TransformerBuffers
     using SizeType32 = runtime::SizeType32;
     using TensorPtr = runtime::ITensor::SharedPtr;
     using TensorMap = runtime::StringPtrMap<runtime::ITensor>;
-    using KvCacheType = batch_manager::kv_cache_manager::CacheType;
 
     static constexpr auto kCrossAttentionMaskTensorName = "cross_attention_mask";
     static constexpr auto kCrossAttentionPackedMaskTensorName = "cross_attention_packed_mask";

cpp/include/tensorrt_llm/executor/executor.h

Lines changed: 11 additions & 3 deletions
@@ -991,15 +991,17 @@ class SchedulerConfig
 class KvCacheConfig
 {
 public:
+    static constexpr auto kDefaultGpuMemFraction = 0.9F;
+
     explicit KvCacheConfig(bool enableBlockReuse = true, std::optional<SizeType32> const& maxTokens = std::nullopt,
         std::optional<std::vector<SizeType32>> const& maxAttentionWindowVec = std::nullopt,
         std::optional<SizeType32> const& sinkTokenLength = std::nullopt,
         std::optional<FloatType> const& freeGpuMemoryFraction = std::nullopt,
         std::optional<size_t> const& hostCacheSize = std::nullopt, bool onboardBlocks = true,
         std::optional<FloatType> const& crossKvCacheFraction = std::nullopt,
         std::optional<RetentionPriority> secondaryOffloadMinPriority = std::nullopt, size_t eventBufferMaxSize = 0,
-        std::optional<tensorrt_llm::runtime::RuntimeDefaults> const& runtimeDefaults = std::nullopt,
-        bool enablePartialReuse = true, bool copyOnPartialReuse = true);
+        bool enablePartialReuse = true, bool copyOnPartialReuse = true, bool useUvm = false,
+        std::optional<tensorrt_llm::runtime::RuntimeDefaults> const& runtimeDefaults = std::nullopt);
 
     [[nodiscard]] bool getEnableBlockReuse() const;
     [[nodiscard]] bool getEnablePartialReuse() const;
@@ -1013,6 +1015,7 @@ class KvCacheConfig
     [[nodiscard]] bool getOnboardBlocks() const;
     [[nodiscard]] std::optional<RetentionPriority> getSecondaryOffloadMinPriority() const;
     [[nodiscard]] size_t getEventBufferMaxSize() const;
+    [[nodiscard]] bool getUseUvm() const;
 
     void setEnableBlockReuse(bool enableBlockReuse);
     void setEnablePartialReuse(bool enablePartialReuse);
@@ -1026,7 +1029,9 @@ class KvCacheConfig
     void setOnboardBlocks(bool onboardBlocks);
     void setSecondaryOffloadMinPriority(std::optional<RetentionPriority> secondaryOffloadMinPriority);
     void setEventBufferMaxSize(size_t eventBufferMaxSize);
-    void fillEmptyFieldsFromRuntimeDefaults(tensorrt_llm::runtime::RuntimeDefaults runtimeDefaults);
+    void setUseUvm(bool useUvm);
+
+    void fillEmptyFieldsFromRuntimeDefaults(tensorrt_llm::runtime::RuntimeDefaults const& runtimeDefaults);
 
 private:
     friend class Serialization;
@@ -1077,6 +1082,9 @@ class KvCacheConfig
 
     /// @brief Whether partially matched blocks that are in use can be reused after copying them
     bool mCopyOnPartialReuse;
+
+    /// @brief Whether to use UVM for the KV cache.
+    bool mUseUvm;
 };
 
 /// @brief Configuration class for the runtime perf knobs
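
A rough usage sketch of the updated interface. Note that useUvm now sits before runtimeDefaults in the constructor, so callers that passed the old trailing arguments positionally need updating; everything below is illustrative, not from the commit:

#include "tensorrt_llm/executor/executor.h"

namespace tle = tensorrt_llm::executor;

tle::KvCacheConfig makeUvmKvCacheConfig()
{
    tle::KvCacheConfig config;
    config.setUseUvm(true); // new setter/getter pair added in this commit

    // kDefaultGpuMemFraction (0.9F) is now a public constant on the executor config,
    // taking over the role of the constant on the removed batch_manager::KvCacheConfig.
    auto const fraction
        = config.getFreeGpuMemoryFraction().value_or(tle::KvCacheConfig::kDefaultGpuMemFraction);
    (void) fraction;

    return config;
}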

cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp

Lines changed: 13 additions & 12 deletions
@@ -2057,32 +2057,33 @@ std::map<SizeType32, std::vector<SizeType32>> BaseKVCacheManager::groupLayersByW
 }
 
 std::tuple<uint64_t, uint64_t> BaseKVCacheManager::calculateFreeMemBytes(
-    runtime::BufferManager const& bufferManager, KvCacheConfig const& config)
+    runtime::BufferManager const& bufferManager, executor::KvCacheConfig const& config)
 {
-    auto const freeMemFraction = config.freeGpuMemoryFraction.value_or(KvCacheConfig::kDefaultGpuMemFraction);
+    auto const freeMemFraction
+        = config.getFreeGpuMemoryFraction().value_or(executor::KvCacheConfig::kDefaultGpuMemFraction);
     TLLM_CHECK_WITH_INFO(freeMemFraction < 1.0F,
         "Invalid freeMemFraction, freeMemFraction (%f) must be smaller than 1.0f", freeMemFraction);
-    if (config.maxTokens.has_value())
+    if (config.getMaxTokens().has_value())
     {
-        if (config.freeGpuMemoryFraction.has_value())
+        if (config.getFreeGpuMemoryFraction().has_value())
         {
             TLLM_LOG_WARNING(
                 "Both freeGpuMemoryFraction (aka kv_cache_free_gpu_mem_fraction) "
                 "and maxTokens (aka max_tokens_in_paged_kv_cache) "
                 "are set (to %f and %ld, respectively). The smaller value will be used.",
-                freeMemFraction, (int64_t) config.maxTokens.value());
+                freeMemFraction, (int64_t) config.getMaxTokens().value());
         }
     }
 
     TLLM_CUDA_CHECK(::cudaDeviceSynchronize());
-    auto const [freeMem, totalMem] = tc::getDeviceMemoryInfo(config.useUvm);
+    auto const [freeMem, totalMem] = tc::getDeviceMemoryInfo(config.getUseUvm());
     auto const finalFreeMem = freeMem + bufferManager.memoryPoolFree();
     TLLM_LOG_INFO("Memory usage when calculating max tokens in paged kv cache: total: %0.2f GiB, available: %0.2f GiB",
         totalMem / static_cast<double>(1 << 30), finalFreeMem / static_cast<double>(1 << 30));
     TLLM_CHECK_WITH_INFO(finalFreeMem <= totalMem, "Free memory cannot exceed total memory");
 
     auto const freePrimaryMemBytes = static_cast<uint64_t>(finalFreeMem * freeMemFraction);
-    auto const freeSecondaryMemBytes = config.hostCacheSize.value_or(0);
+    auto const freeSecondaryMemBytes = config.getHostCacheSize().value_or(0);
 
     TLLM_LOG_DEBUG("Calculated free memory: {.freePrimaryMemBytes=%" PRIu64 ", .freeSecondaryMemBytes=%" PRIu64 "}",
         freePrimaryMemBytes, freeSecondaryMemBytes);
@@ -2120,7 +2121,7 @@ bool isSortedVectorIdenticalAcrossAllRanks(WorldConfig const& worldConfig, std::
 }
 } // namespace
 
-BlocksPerWindow BaseKVCacheManager::calculateMaxNumBlocks(KvCacheConfig const& config, bool isCrossAttention,
+BlocksPerWindow BaseKVCacheManager::calculateMaxNumBlocks(executor::KvCacheConfig const& config, bool isCrossAttention,
     nvinfer1::DataType dtype, ModelConfig const& modelConfig, WorldConfig const& worldConfig,
     std::map<SizeType32, std::vector<SizeType32>> const& windowSizeToLayers, uint64_t allottedPrimaryMemBytes,
     uint64_t allottedSecondaryMemBytes, size_t extraCostMemory, SizeType32 kvFactor)
@@ -2130,7 +2131,7 @@ BlocksPerWindow BaseKVCacheManager::calculateMaxNumBlocks(KvCacheConfig const& c
         isCrossAttention ? "Cross KvCacheManager" : "Self KvCacheManager", allottedPrimaryMemBytes,
         allottedSecondaryMemBytes);
 
-    if (config.maxTokens.has_value() && windowSizeToLayers.size() > 1)
+    if (config.getMaxTokens().has_value() && windowSizeToLayers.size() > 1)
     {
         TLLM_LOG_WARNING(
             "Setting maxTokens when using Variable Sliding Window Attention is a strange concept, as it limits "
@@ -2162,9 +2163,9 @@ BlocksPerWindow BaseKVCacheManager::calculateMaxNumBlocks(KvCacheConfig const& c
         TLLM_LOG_DEBUG("windowSizeShare: %f, cacheSizeBytesPerToken: %d", windowSizeShare, cacheSizeBytesPerToken);
         auto maxTokens = static_cast<uint64_t>(
             allottedPrimaryMemBytes * windowSizeShare / static_cast<double>(cacheSizeBytesPerToken));
-        if (config.maxTokens.has_value())
+        if (config.getMaxTokens().has_value())
        {
-            auto const maxTokensFromConfig = static_cast<uint64_t>(config.maxTokens.value());
+            auto const maxTokensFromConfig = static_cast<uint64_t>(config.getMaxTokens().value());
             TLLM_LOG_DEBUG("Maximum kv-cache token overridden by configuration as '%ld'.", maxTokensFromConfig);
             maxTokens = std::min(maxTokensFromConfig, maxTokens);
         }
@@ -2184,7 +2185,7 @@ BlocksPerWindow BaseKVCacheManager::calculateMaxNumBlocks(KvCacheConfig const& c
         TLLM_LOG_DEBUG(
             "Number of blocks in KV cache secondary pool for windowSize %d: %d, onboard blocks to primary memory "
             "before reuse: %s",
-            windowSize, blocksInSecondaryPool, config.onboardBlocks ? "true" : "false");
+            windowSize, blocksInSecondaryPool, config.getOnboardBlocks() ? "true" : "false");
         return blocksInSecondaryPool;
     };
 
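
The arithmetic is unchanged; only the accessors move to the executor config's getters. A standalone back-of-the-envelope version of the memory split, with made-up names and without the logging and CUDA queries:

#include <cstdint>
#include <optional>
#include <tuple>

std::tuple<std::uint64_t, std::uint64_t> splitFreeMem(std::uint64_t freeDeviceBytes,
    std::optional<double> freeGpuMemoryFraction, std::optional<std::uint64_t> hostCacheSize)
{
    // Mirrors executor::KvCacheConfig::kDefaultGpuMemFraction (0.9F in the header change above).
    constexpr double kDefaultGpuMemFraction = 0.9;
    auto const fraction = freeGpuMemoryFraction.value_or(kDefaultGpuMemFraction);

    // Primary pool: a fraction of free device memory; secondary pool: the optional host cache size.
    auto const freePrimaryMemBytes = static_cast<std::uint64_t>(freeDeviceBytes * fraction);
    auto const freeSecondaryMemBytes = hostCacheSize.value_or(0);
    return {freePrimaryMemBytes, freeSecondaryMemBytes};
}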

cpp/tensorrt_llm/batch_manager/transformerBuffers.cpp

Lines changed: 2 additions & 3 deletions
@@ -17,7 +17,6 @@
 
 #include "tensorrt_llm/batch_manager/transformerBuffers.h"
 
-#include "tensorrt_llm/batch_manager/kvCacheConfig.h"
 #include "tensorrt_llm/batch_manager/kvCacheManager.h"
 #include "tensorrt_llm/common/assert.h"
 #include "tensorrt_llm/common/logger.h"
@@ -221,7 +220,7 @@ void TransformerBuffers::reshapeKvTensors(SizeType32 maxBatchSize, SizeType32 ma
     TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
 
     // allocate with max shape during init
-    if (kvCacheType == KvCacheType::kSELF)
+    if (kvCacheType == kv_cache_manager::CacheType::kSELF)
     {
         auto const cacheBlockOffsetsShape
             = ITensor::makeShape({numPools, maxBatchSize * maxBeamWidth, 2, maxBlocksPerSeq});
@@ -232,7 +231,7 @@ void TransformerBuffers::reshapeKvTensors(SizeType32 maxBatchSize, SizeType32 ma
         kvCacheBlockOffsetsDevice->reshape(cacheBlockOffsetsShape);
         manager.setZero(*kvCacheBlockOffsetsDevice);
     }
-    else if (kvCacheType == KvCacheType::kCROSS)
+    else if (kvCacheType == kv_cache_manager::CacheType::kCROSS)
     {
         auto const crossCacheBlockOffsetsShape
             = ITensor::makeShape({numPools, maxBatchSize * maxBeamWidth, 2, maxBlocksPerSeq});
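
With the local KvCacheType alias removed from TransformerBuffers, comparisons spell the enum out through its namespace. A minimal illustration with a hypothetical helper:

#include "tensorrt_llm/batch_manager/kvCacheType.h"

namespace kvcm = tensorrt_llm::batch_manager::kv_cache_manager;

char const* cacheTypeName(kvcm::CacheType kvCacheType)
{
    if (kvCacheType == kvcm::CacheType::kSELF)
    {
        return "self";
    }
    if (kvCacheType == kvcm::CacheType::kCROSS)
    {
        return "cross";
    }
    return "self, keys only"; // kSELFKONLY
}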
