@@ -2057,32 +2057,33 @@ std::map<SizeType32, std::vector<SizeType32>> BaseKVCacheManager::groupLayersByW
2057
2057
}
2058
2058
2059
2059
std::tuple<uint64_t , uint64_t > BaseKVCacheManager::calculateFreeMemBytes (
2060
- runtime::BufferManager const & bufferManager, KvCacheConfig const & config)
2060
+ runtime::BufferManager const & bufferManager, executor:: KvCacheConfig const & config)
2061
2061
{
2062
- auto const freeMemFraction = config.freeGpuMemoryFraction .value_or (KvCacheConfig::kDefaultGpuMemFraction );
2062
+ auto const freeMemFraction
2063
+ = config.getFreeGpuMemoryFraction ().value_or (executor::KvCacheConfig::kDefaultGpuMemFraction );
2063
2064
TLLM_CHECK_WITH_INFO (freeMemFraction < 1 .0F ,
2064
2065
" Invalid freeMemFraction, freeMemFraction (%f) must be smaller than 1.0f" , freeMemFraction);
2065
- if (config.maxTokens .has_value ())
2066
+ if (config.getMaxTokens () .has_value ())
2066
2067
{
2067
- if (config.freeGpuMemoryFraction .has_value ())
2068
+ if (config.getFreeGpuMemoryFraction () .has_value ())
2068
2069
{
2069
2070
TLLM_LOG_WARNING (
2070
2071
" Both freeGpuMemoryFraction (aka kv_cache_free_gpu_mem_fraction) "
2071
2072
" and maxTokens (aka max_tokens_in_paged_kv_cache) "
2072
2073
" are set (to %f and %ld, respectively). The smaller value will be used." ,
2073
- freeMemFraction, (int64_t ) config.maxTokens .value ());
2074
+ freeMemFraction, (int64_t ) config.getMaxTokens () .value ());
2074
2075
}
2075
2076
}
2076
2077
2077
2078
TLLM_CUDA_CHECK (::cudaDeviceSynchronize ());
2078
- auto const [freeMem, totalMem] = tc::getDeviceMemoryInfo (config.useUvm );
2079
+ auto const [freeMem, totalMem] = tc::getDeviceMemoryInfo (config.getUseUvm () );
2079
2080
auto const finalFreeMem = freeMem + bufferManager.memoryPoolFree ();
2080
2081
TLLM_LOG_INFO (" Memory usage when calculating max tokens in paged kv cache: total: %0.2f GiB, available: %0.2f GiB" ,
2081
2082
totalMem / static_cast <double >(1 << 30 ), finalFreeMem / static_cast <double >(1 << 30 ));
2082
2083
TLLM_CHECK_WITH_INFO (finalFreeMem <= totalMem, " Free memory cannot exceed total memory" );
2083
2084
2084
2085
auto const freePrimaryMemBytes = static_cast <uint64_t >(finalFreeMem * freeMemFraction);
2085
- auto const freeSecondaryMemBytes = config.hostCacheSize .value_or (0 );
2086
+ auto const freeSecondaryMemBytes = config.getHostCacheSize () .value_or (0 );
2086
2087
2087
2088
TLLM_LOG_DEBUG (" Calculated free memory: {.freePrimaryMemBytes=%" PRIu64 " , .freeSecondaryMemBytes=%" PRIu64 " }" ,
2088
2089
freePrimaryMemBytes, freeSecondaryMemBytes);
@@ -2120,7 +2121,7 @@ bool isSortedVectorIdenticalAcrossAllRanks(WorldConfig const& worldConfig, std::
2120
2121
}
2121
2122
} // namespace
2122
2123
2123
- BlocksPerWindow BaseKVCacheManager::calculateMaxNumBlocks (KvCacheConfig const & config, bool isCrossAttention,
2124
+ BlocksPerWindow BaseKVCacheManager::calculateMaxNumBlocks (executor:: KvCacheConfig const & config, bool isCrossAttention,
2124
2125
nvinfer1::DataType dtype, ModelConfig const & modelConfig, WorldConfig const & worldConfig,
2125
2126
std::map<SizeType32, std::vector<SizeType32>> const & windowSizeToLayers, uint64_t allottedPrimaryMemBytes,
2126
2127
uint64_t allottedSecondaryMemBytes, size_t extraCostMemory, SizeType32 kvFactor)
@@ -2130,7 +2131,7 @@ BlocksPerWindow BaseKVCacheManager::calculateMaxNumBlocks(KvCacheConfig const& c
2130
2131
isCrossAttention ? " Cross KvCacheManager" : " Self KvCacheManager" , allottedPrimaryMemBytes,
2131
2132
allottedSecondaryMemBytes);
2132
2133
2133
- if (config.maxTokens .has_value () && windowSizeToLayers.size () > 1 )
2134
+ if (config.getMaxTokens () .has_value () && windowSizeToLayers.size () > 1 )
2134
2135
{
2135
2136
TLLM_LOG_WARNING (
2136
2137
" Setting maxTokens when using Variable Sliding Window Attention is a strange concept, as it limits "
@@ -2162,9 +2163,9 @@ BlocksPerWindow BaseKVCacheManager::calculateMaxNumBlocks(KvCacheConfig const& c
2162
2163
TLLM_LOG_DEBUG (" windowSizeShare: %f, cacheSizeBytesPerToken: %d" , windowSizeShare, cacheSizeBytesPerToken);
2163
2164
auto maxTokens = static_cast <uint64_t >(
2164
2165
allottedPrimaryMemBytes * windowSizeShare / static_cast <double >(cacheSizeBytesPerToken));
2165
- if (config.maxTokens .has_value ())
2166
+ if (config.getMaxTokens () .has_value ())
2166
2167
{
2167
- auto const maxTokensFromConfig = static_cast <uint64_t >(config.maxTokens .value ());
2168
+ auto const maxTokensFromConfig = static_cast <uint64_t >(config.getMaxTokens () .value ());
2168
2169
TLLM_LOG_DEBUG (" Maximum kv-cache token overridden by configuration as '%ld'." , maxTokensFromConfig);
2169
2170
maxTokens = std::min (maxTokensFromConfig, maxTokens);
2170
2171
}
@@ -2184,7 +2185,7 @@ BlocksPerWindow BaseKVCacheManager::calculateMaxNumBlocks(KvCacheConfig const& c
2184
2185
TLLM_LOG_DEBUG (
2185
2186
" Number of blocks in KV cache secondary pool for windowSize %d: %d, onboard blocks to primary memory "
2186
2187
" before reuse: %s" ,
2187
- windowSize, blocksInSecondaryPool, config.onboardBlocks ? " true" : " false" );
2188
+ windowSize, blocksInSecondaryPool, config.getOnboardBlocks () ? " true" : " false" );
2188
2189
return blocksInSecondaryPool;
2189
2190
};
2190
2191
0 commit comments