Commit 6d4b045

refactor: Remove enforced sorted order of batch slots (#3502)

Signed-off-by: Robin Kobus <[email protected]>
1 parent f5f5be9 · commit 6d4b045
26 files changed: +103 −167 lines

cpp/include/tensorrt_llm/batch_manager/runtimeBuffers.h

Lines changed: 1 addition & 5 deletions

@@ -232,13 +232,9 @@ class RuntimeBuffers
 
     GenerationLogitsCache generationLogitsCache;
 
-    //! Helper for KV cache rewind
+    //! Mapping from batch idx to slot id
     TensorPtr seqSlots;
     TensorPtr seqSlotsDevice;
-    TensorPtr sortedSeqSlots;
-    //! For KV cache rewind
-    TensorPtr seqSlotRemappingHost;   // [numSequences]
-    TensorPtr seqSlotRemappingDevice; // [numSequences]
 
     //! Explicitly device-copy src offsets to reduce warp stalls in copy batch kernel invocation
     //! [mMaxNumRequests], on gpu
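
For orientation, here is a minimal, self-contained sketch of the mapping the new comment describes, with plain std::vector standing in for the TensorPtr buffers and made-up slot values: seqSlots[batchIdx] holds the sequence slot id of the request at batch position batchIdx, and after this commit the slot ids are no longer kept in sorted order.

```cpp
#include <cstdint>
#include <vector>

int main()
{
    // Hypothetical batch of three requests occupying slots 7, 2 and 5:
    // index = position in the current batch, value = sequence slot id.
    // The slot ids are in request order, not sorted.
    std::vector<int32_t> seqSlots{7, 2, 5};

    // Slot-major state (sized for the maximum number of slots) is addressed
    // through the mapping, independent of the order of the batch.
    std::vector<int32_t> tokensPerSlot(16, 0);
    for (std::size_t batchIdx = 0; batchIdx < seqSlots.size(); ++batchIdx)
    {
        tokensPerSlot[seqSlots[batchIdx]] += 1;
    }
    return 0;
}
```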

cpp/tensorrt_llm/batch_manager/assignReqSeqSlots.cpp

Lines changed: 1 addition & 1 deletion

@@ -37,7 +37,7 @@ void tensorrt_llm::batch_manager::AssignReqSeqSlots::operator()(SequenceSlotMan
             llmReq->setFirstScheduledTime();
         }
         auto const reqSeqSlot = seqSlotManager.getSequenceSlot(isReqNew, llmReq->mRequestId);
-        TLLM_CHECK_WITH_INFO(reqSeqSlot, "Unable to get batch slot for reqId");
+        TLLM_CHECK_WITH_INFO(reqSeqSlot, "Unable to get batch slot for request ID %lu", llmReq->mRequestId);
         llmReq->mSeqSlot = reqSeqSlot;
     }
 }
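
The only change here is that the failure message now carries the offending request id. The stand-in below is not the TensorRT-LLM macro, just an illustration of the printf-style pattern and why it helps when a check fires in a multi-request batch.

```cpp
#include <cstdio>
#include <cstdlib>

// Illustrative stand-in for a printf-style check macro; the real
// TLLM_CHECK_WITH_INFO lives in TensorRT-LLM and differs in detail.
#define CHECK_WITH_INFO(cond, fmt, ...)                                        \
    do                                                                         \
    {                                                                          \
        if (!(cond))                                                           \
        {                                                                      \
            std::fprintf(stderr, "Check failed: " fmt "\n", __VA_ARGS__);      \
            std::abort();                                                      \
        }                                                                      \
    } while (0)

int main()
{
    unsigned long const requestId = 42; // hypothetical request id
    bool const gotSlot = false;
    // The message now identifies which request could not get a slot.
    CHECK_WITH_INFO(gotSlot, "Unable to get batch slot for request ID %lu", requestId);
    return 0;
}
```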

cpp/tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.cpp

Lines changed: 4 additions & 13 deletions

@@ -92,29 +92,20 @@ namespace
 std::pair<std::vector<SizeType32>, std::vector<SizeType32>> getActiveSlots(
     RequestVector const& contextRequests, RequestVector const& generationRequests)
 {
-    std::vector<std::pair<SizeType32, SizeType32>> slots;
+    std::vector<SizeType32> activeSlots;
+    std::vector<SizeType32> generationSteps;
     for (auto const& requests : {contextRequests, generationRequests})
     {
         for (auto const& llmReq : requests)
         {
             if (llmReq->isGenerationInProgressState() || llmReq->isLastContextChunk())
             {
-                slots.push_back({llmReq->mSeqSlot.value(), llmReq->getDecodingIter()});
+                activeSlots.push_back(llmReq->mSeqSlot.value());
+                generationSteps.push_back(llmReq->getDecodingIter());
             }
         }
     }
 
-    std::sort(slots.begin(), slots.end(),
-        [](std::pair<SizeType32, SizeType32> const& a, std::pair<SizeType32, SizeType32> const& b)
-        { return a.first < b.first; });
-
-    std::vector<SizeType32> activeSlots, generationSteps;
-    for (auto const& slot : slots)
-    {
-        activeSlots.push_back(slot.first);
-        generationSteps.push_back(slot.second);
-    }
-
     return {activeSlots, generationSteps};
 }
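
A standalone sketch of the simplified behavior, with a hypothetical FakeRequest type in place of the real request class and the state checks omitted: slots and generation steps are now emitted in request order, without the sort-by-slot pass this commit removes.

```cpp
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

// Hypothetical stand-in for the real request type.
struct FakeRequest
{
    int32_t seqSlot;
    int32_t decodingIter;
};

// Collect active slots and generation steps in request order (no sorting).
std::pair<std::vector<int32_t>, std::vector<int32_t>> getActiveSlotsSketch(
    std::vector<FakeRequest> const& contextRequests, std::vector<FakeRequest> const& generationRequests)
{
    std::vector<int32_t> activeSlots;
    std::vector<int32_t> generationSteps;
    for (auto const* requests : {&contextRequests, &generationRequests})
    {
        for (auto const& req : *requests)
        {
            activeSlots.push_back(req.seqSlot);
            generationSteps.push_back(req.decodingIter);
        }
    }
    return {activeSlots, generationSteps};
}

int main()
{
    auto const [slots, steps] = getActiveSlotsSketch({{5, 0}, {2, 0}}, {{7, 3}});
    for (std::size_t i = 0; i < slots.size(); ++i)
    {
        std::cout << slots[i] << ' ' << steps[i] << '\n'; // 5 0, 2 0, 7 3: request order is preserved
    }
    return 0;
}
```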

cpp/tensorrt_llm/batch_manager/runtimeBuffers.cpp

Lines changed: 0 additions & 21 deletions

@@ -104,9 +104,6 @@ void RuntimeBuffers::create(SizeType32 maxBatchSize, SizeType32 maxBeamWidth,
         logits = manager.emptyTensor(MemoryType::kGPU, logitsType);
     }
 
-    seqSlotRemappingHost = manager.emptyTensor(MemoryType::kPINNEDPOOL, nvinfer1::DataType::kINT32);
-    seqSlotRemappingDevice = manager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32);
-
     // TODO: check which tensors can be allocated as pinned for max size
     requestTypes = manager.emptyTensor(MemoryType::kCPU, TRTDataType<runtime::RequestType>::value);
 
@@ -129,7 +126,6 @@ void RuntimeBuffers::create(SizeType32 maxBatchSize, SizeType32 maxBeamWidth,
     auto const maxBatchSizeShape = ITensor::makeShape({maxBatchSize});
     seqSlots = tensorrt_llm::runtime::BufferManager::pinnedPool(maxBatchSizeShape, nvinfer1::DataType::kINT32);
     seqSlotsDevice = manager.gpu(maxBatchSizeShape, nvinfer1::DataType::kINT32);
-    sortedSeqSlots = tensorrt_llm::runtime::BufferManager::pinnedPool(maxBatchSizeShape, nvinfer1::DataType::kINT32);
 
     cacheIndirDecoderIOBatchedCopySrcOffsets
         = tensorrt_llm::runtime::BufferManager::pinnedPool(maxBatchSizeShape, nvinfer1::DataType::kINT64);
@@ -383,9 +379,6 @@ void RuntimeBuffers::reshape(TllmRuntime const& runtime, ModelConfig const& mode
     auto const numRequestsShape = ITensor::makeShape({numRequests});
     seqSlots->reshape(numRequestsShape);
     seqSlotsDevice->reshape(numRequestsShape);
-    sortedSeqSlots->reshape(numRequestsShape);
-    seqSlotRemappingHost->reshape(numRequestsShape);
-    seqSlotRemappingDevice->reshape(numRequestsShape);
 
     auto const numTokens = getNumTokens();
     inputsIds->reshape(ITensor::makeShape({numTokens}));
@@ -740,20 +733,6 @@ void RuntimeBuffers::setFromInputs(RequestVector const& contextRequests, Request
             std::fill_n(sequenceLengthsHostPtr + numSequences, reqBeamWidth, sequenceLen);
             numSequences += reqBeamWidth;
         }
-        if (modelConfig.getSpeculativeDecodingMode().needsKVCacheRewind())
-        {
-            auto remappingSeqSlotIndices = BufferRange<SizeType32>(*seqSlotRemappingHost);
-            auto const* seqSlotIndices = bufferCast<SizeType32>(*seqSlots);
-
-            std::iota(remappingSeqSlotIndices.begin(), remappingSeqSlotIndices.end(), 0);
-            std::sort(remappingSeqSlotIndices.begin(), remappingSeqSlotIndices.end(),
-                [&seqSlotIndices](SizeType32 a, SizeType32 b) { return seqSlotIndices[a] < seqSlotIndices[b]; });
-            manager.copy(*seqSlotRemappingHost, *seqSlotRemappingDevice);
-
-            manager.copy(*seqSlots, *sortedSeqSlots);
-            auto sortedSeqSlotIndices = BufferRange<SizeType32>(*sortedSeqSlots);
-            std::sort(sortedSeqSlotIndices.begin(), sortedSeqSlotIndices.end());
-        }
        if (modelConfig.getSpeculativeDecodingMode().isLookaheadDecoding())
        {
            // copy from lookahead decoding buffer
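
The deleted block above built an argsort of seqSlots (a permutation mapping sorted-slot order back to batch positions) plus a sorted copy of the slots, purely so that downstream KV cache rewind kernels could assume sorted slots. The standalone sketch below, with plain std::vector in place of the pinned and device tensors, reproduces what that remapping computed; with the sorted-order requirement dropped, none of this bookkeeping is needed.

```cpp
#include <algorithm>
#include <numeric>
#include <vector>

int main()
{
    // Hypothetical slot ids in batch (request) order.
    std::vector<int> const seqSlots{7, 2, 5};

    // Argsort: remapping[i] is the batch position holding the i-th smallest slot id.
    std::vector<int> remapping(seqSlots.size());
    std::iota(remapping.begin(), remapping.end(), 0);
    std::sort(remapping.begin(), remapping.end(),
        [&seqSlots](int a, int b) { return seqSlots[a] < seqSlots[b]; });
    // remapping == {1, 2, 0}

    // Sorted copy of the slot ids.
    std::vector<int> sortedSeqSlots = seqSlots;
    std::sort(sortedSeqSlots.begin(), sortedSeqSlots.end());
    // sortedSeqSlots == {2, 5, 7}

    return 0;
}
```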

cpp/tensorrt_llm/batch_manager/sequenceSlotManager.cpp

Lines changed: 1 addition & 1 deletion

@@ -61,7 +61,7 @@ std::optional<SequenceSlotManager::SlotIdType> SequenceSlotManager::getSequenceS
     auto const it = mSequenceIdToSlot.find(sequenceId);
     if (it == mSequenceIdToSlot.end())
     {
-        TLLM_LOG_ERROR("Could not find sequence id in allocated sequence slots");
+        TLLM_LOG_ERROR("Could not find sequence id %lu in allocated sequence slots", sequenceId);
     }
     else
     {
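
As with the check above, this is purely a diagnostics change: the lookup itself is unchanged. A minimal sketch of the lookup pattern, with hypothetical names and key type rather than the repository's class, for context:

```cpp
#include <cstdint>
#include <cstdio>
#include <optional>
#include <unordered_map>

using SlotIdType = int32_t;

// Return the slot for a sequence id if it was allocated; otherwise log the id
// (the point of the change above) and return an empty optional.
std::optional<SlotIdType> findSlot(
    std::unordered_map<std::uint64_t, SlotIdType> const& sequenceIdToSlot, std::uint64_t sequenceId)
{
    auto const it = sequenceIdToSlot.find(sequenceId);
    if (it == sequenceIdToSlot.end())
    {
        std::fprintf(stderr, "Could not find sequence id %llu in allocated sequence slots\n",
            static_cast<unsigned long long>(sequenceId));
        return std::nullopt;
    }
    return it->second;
}

int main()
{
    std::unordered_map<std::uint64_t, SlotIdType> const slots{{1, 0}, {2, 1}};
    if (!findSlot(slots, 3))
    {
        // Slot not found: the log line above identifies id 3 directly.
    }
    return 0;
}
```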

cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp

Lines changed: 10 additions & 5 deletions

@@ -983,8 +983,10 @@ void TrtGptModelInflightBatching::forwardAsync(RequestList const& activeRequests
     if (fittingRequests.empty() && fittingDisaggGenInitRequests.empty())
     {
         TLLM_LOG_WARNING(
-            "CapacityScheduler didn't schedule any requests, probably because of insufficient resources such as KV "
-            "cache, will try wait for KV cache transfer to complete");
+            "CapacityScheduler didn't schedule any requests in iteration %lu, "
+            "probably because of insufficient resources such as KV cache, "
+            "will try wait for KV cache transfer to complete",
+            mIterCounter);
         if (mCacheTransceiver)
         {
             mCacheTransceiver->checkContextTransferStatus(1);
@@ -1038,6 +1040,10 @@ void TrtGptModelInflightBatching::forwardAsync(RequestList const& activeRequests
         auto const contextBufferId = mCtxGenFusion ? getFusedBufferId() : getContextBufferId();
         setupDecoderStep(currRequests.contextRequests, *mBuffers.at(contextBufferId),
             mDecoderInputBuffers.at(getFusedBufferId()));
+        // WAR: Sync to ensure that the decoder setup is complete before the context phase starts.
+        // Without this, there may be a race condition between the decoder setup and the context phase
+        // which also leads to spurious test failure in trtGptModelRealDecoderTest.
+        mRuntime->getStream().synchronize();
     }
     else
     {
@@ -2432,9 +2438,8 @@ void TrtGptModelInflightBatching::rewindKVCacheBlocks(SizeType32 numSequences)
     tensorrt_llm::runtime::kernels::invokeUpdateKVBlockArrayDraftTokenLocation(
         *mDecoderState->getAcceptedLengthsCumSum(), *mDecoderState->getAcceptedPackedPaths(),
         *runtimeBuffers.sequenceLengthsDevice, pointerArrayPtr, offsetArrayPtr, localNbLayers, numSequences,
-        mRewindInputs.numKvHeads, sizeInBytesPerKVHead, commonRewindLen, rewindLens,
-        *runtimeBuffers.seqSlotRemappingDevice, *runtimeBuffers.sortedSeqSlots, getMaxAttentionWindow(),
-        mRewindInputs.maxBlocksPerSeq, tokensPerBlock, mRewindInputs.isUseOneMoreBlock,
+        mRewindInputs.numKvHeads, sizeInBytesPerKVHead, commonRewindLen, rewindLens, *runtimeBuffers.seqSlots,
+        getMaxAttentionWindow(), mRewindInputs.maxBlocksPerSeq, tokensPerBlock, mRewindInputs.isUseOneMoreBlock,
         mRuntime->getStreamPtr()->get());
 
     sync_check_cuda_error(mRuntime->getStream().get());
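
The WAR added in the second hunk is a plain host-side stream synchronization between the decoder setup and the work that depends on it. A generic illustration of the pattern using the CUDA runtime API (not the repository's runtime wrapper) is shown below.

```cpp
#include <cuda_runtime.h>

// Ensure that everything already enqueued on `stream` (e.g. decoder setup
// kernels and copies) has finished before the caller launches dependent work.
// This mirrors the intent of mRuntime->getStream().synchronize() above.
void synchronizeBeforeDependentWork(cudaStream_t stream)
{
    // Blocks the host until all work queued on `stream` has completed.
    cudaStreamSynchronize(stream);

    // ... it is now safe to enqueue the context phase ...
}
```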

cpp/tensorrt_llm/kernels/speculativeDecoding/common.cu

Lines changed: 6 additions & 22 deletions

@@ -40,8 +40,8 @@ namespace tensorrt_llm::kernels::speculative_decoding
 template <int32_t BLOCK_SIZE>
 __global__ void packAcceptedPaths(SizeType32* acceptedLengthsCumSum, SizeType32* pathsOffsets,
     SizeType32 const* acceptedLengths, SizeType32 const* bestPathIds, SizeType32 const* paths,
-    SizeType32 const* batchSlots, runtime::SizeType32 const* seqSlots, SizeType32 batchSize, SizeType32 engineBatchSize,
-    SizeType32 numPaths, SizeType32 maxPathLen, bool isPathsSeqSlotIdx)
+    SizeType32 const* batchSlots, SizeType32 batchSize, SizeType32 engineBatchSize, SizeType32 numPaths,
+    SizeType32 maxPathLen, bool isPathsSeqSlotIdx)
 {
     // Specialize BlockScan for a 1D block of 128 threads of type int
     typedef cub::BlockScan<SizeType32, BLOCK_SIZE> BlockScan;
@@ -81,22 +81,7 @@ __global__ void packAcceptedPaths(SizeType32* acceptedLengthsCumSum, SizeType32*
     }
     __syncthreads();
 
-    int32_t pathBatchIdx{batchSlot};
-    if (isPathsSeqSlotIdx)
-    {
-        // If paths tensor is the tensor arranged according to seq slot,
-        // we must find the position of the batchSlots index in the seq slot array.
-        // TODO optimize it.
-        for (int bi = 0; bi < batchSize; ++bi)
-        {
-            auto const seqSlot = seqSlots[bi];
-            if (batchSlot == seqSlot)
-            {
-                pathBatchIdx = bi;
-                break;
-            }
-        }
-    }
+    auto const pathBatchIdx = isPathsSeqSlotIdx ? bi : batchSlot;
 
     if (valid)
     {
@@ -117,13 +102,12 @@ __global__ void packAcceptedPaths(SizeType32* acceptedLengthsCumSum, SizeType32*
 
 void invokePackAcceptedPaths(SizeType32* acceptedLengthsCumSum, SizeType32* pathsOffsets,
     SizeType32 const* acceptedLengths, SizeType32 const* bestPathIds, SizeType32 const* paths,
-    SizeType32 const* batchSlots, runtime::SizeType32 const* seqSlots, SizeType32 batchSize, SizeType32 engineBatchSize,
-    SizeType32 numPaths, SizeType32 maxPathLen, bool isPathsLinearBatchIdx, cudaStream_t stream)
+    SizeType32 const* batchSlots, SizeType32 batchSize, SizeType32 engineBatchSize, SizeType32 numPaths,
+    SizeType32 maxPathLen, bool isPathsSeqSlotIdx, cudaStream_t stream)
 {
     constexpr SizeType32 BLOCK_SIZE = 1024;
     packAcceptedPaths<BLOCK_SIZE><<<1, BLOCK_SIZE, 0, stream>>>(acceptedLengthsCumSum, pathsOffsets, acceptedLengths,
-        bestPathIds, paths, batchSlots, seqSlots, batchSize, engineBatchSize, numPaths, maxPathLen,
-        isPathsLinearBatchIdx);
+        bestPathIds, paths, batchSlots, batchSize, engineBatchSize, numPaths, maxPathLen, isPathsSeqSlotIdx);
 }
 
 namespace
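
The removed block did a linear search for the position of batchSlot inside seqSlots; with the enforced ordering gone, batchSlots and seqSlots are assumed to enumerate requests in the same order, so that position is simply the local index bi. A small host-side sketch (hypothetical values) of why the two formulations agree under that assumption:

```cpp
#include <cassert>
#include <vector>

int main()
{
    // Assumed: both arrays list the same slots in the same (request) order.
    std::vector<int> const seqSlots{7, 2, 5};
    std::vector<int> const batchSlots{7, 2, 5};

    for (int bi = 0; bi < static_cast<int>(batchSlots.size()); ++bi)
    {
        // Old formulation: search for batchSlots[bi] inside seqSlots.
        int searched = -1;
        for (int j = 0; j < static_cast<int>(seqSlots.size()); ++j)
        {
            if (seqSlots[j] == batchSlots[bi])
            {
                searched = j;
                break;
            }
        }
        // New formulation: use the local index directly.
        assert(searched == bi);
    }
    return 0;
}
```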

cpp/tensorrt_llm/kernels/speculativeDecoding/common.h

Lines changed: 3 additions & 7 deletions

@@ -38,10 +38,6 @@ namespace tensorrt_llm::kernels::speculative_decoding
 //! everything that is not path.
 //! \param batchSlots input buffer [engineBatchSize], address map from local index to
 //! global index [0, batchSize] -> [0, maxBatchSize].
-//! This is in the order of increasing order of the requests in the decoder.
-//! \param seqSlots input buffer [engineBatchSize], address map from local index to
-//! global index [0, batchSize] -> [0, maxBatchSize]
-//! These are the slots of the sequences in the runtime buffers.
 //! \param batchSize the number of sequences to be decoded
 //! \param engineBatchSize number of sequences processed in the engine.
 //! Includes chunked context reqs that are not in the last chunk.
@@ -52,9 +48,9 @@ namespace tensorrt_llm::kernels::speculative_decoding
 //! \param stream stream
 void invokePackAcceptedPaths(runtime::SizeType32* acceptedLengthsCumSum, runtime::SizeType32* pathsOffsets,
     runtime::SizeType32 const* acceptedLengths, runtime::SizeType32 const* bestPathIds,
-    runtime::SizeType32 const* paths, runtime::SizeType32 const* batchSlots, runtime::SizeType32 const* seqSlots,
-    runtime::SizeType32 batchSize, runtime::SizeType32 engineBatchSize, runtime::SizeType32 numPaths,
-    runtime::SizeType32 maxPathLen, bool isPathsSeqSlotIdx, cudaStream_t stream);
+    runtime::SizeType32 const* paths, runtime::SizeType32 const* batchSlots, runtime::SizeType32 batchSize,
+    runtime::SizeType32 engineBatchSize, runtime::SizeType32 numPaths, runtime::SizeType32 maxPathLen,
+    bool isPathsSeqSlotIdx, cudaStream_t stream);
 
 template <typename T>
 struct AcceptDraftTokensByIdsWithPathsParams

cpp/tensorrt_llm/kernels/speculativeDecoding/eagleDecodingKernels.cu

Lines changed: 11 additions & 30 deletions

@@ -1372,49 +1372,30 @@ void invokeGetPackedMaskFromPath(int32_t* specDecodingPackedMasks, SizeType32 co
 namespace
 {
 template <int BLOCK_SIZE>
-__global__ void augmentBatchSlotsKernel(SizeType32* augmentedSeqSlots, SizeType32* augmentedBatchSlots,
-    SizeType32 const* chunkedContextNextTokens, SizeType32 const* lastDraftLens, SizeType32 const* seqSlots,
-    SizeType32 const* batchSlots, SizeType32 actualBatchSize)
+__global__ void augmentBatchSlotsKernel(SizeType32* augmentedSeqSlots, SizeType32 const* chunkedContextNextTokens,
+    SizeType32 const* lastDraftLens, SizeType32 const* seqSlots, SizeType32 engineBatchSize)
 {
-    typedef cub::BlockScan<SizeType32, BLOCK_SIZE> BlockScan;
-    __shared__ typename BlockScan::TempStorage tempStorage;
-
     auto const batchIdx = static_cast<SizeType32>(threadIdx.x);
-    auto const valid = batchIdx < actualBatchSize;
+    auto const valid = batchIdx < engineBatchSize;
 
-    bool needDecoding{false};
     if (valid)
     {
         auto const draftLen = lastDraftLens[batchIdx];
-        needDecoding = (draftLen == 0 && chunkedContextNextTokens[batchIdx] == -1) || (draftLen > 0);
-    }
-
-    SizeType32 originalIndex{0};
-    BlockScan(tempStorage).ExclusiveSum(needDecoding, originalIndex);
-
-    if (needDecoding)
-    {
-        augmentedSeqSlots[batchIdx] = seqSlots[batchIdx];
-        augmentedBatchSlots[batchIdx] = batchSlots[originalIndex];
-    }
-    else if (valid)
-    {
-        augmentedSeqSlots[batchIdx] = -1;
-        augmentedBatchSlots[batchIdx] = -1;
+        auto const needDecoding = (draftLen == 0 && chunkedContextNextTokens[batchIdx] == -1) || (draftLen > 0);
+        augmentedSeqSlots[batchIdx] = needDecoding ? seqSlots[batchIdx] : -1;
     }
 }
 } // namespace
 
-void invokeAugmentBatchSlots(SizeType32* augmentedSeqSlots, SizeType32* augmentedBatchSlots,
-    runtime::SizeType32 const* chunkedContextNextTokens, runtime::SizeType32 const* lastDraftLens,
-    SizeType32 const* seqSlots, SizeType32 const* batchSlots, SizeType32 actualBatchSize, SizeType32 batchSize,
-    cudaStream_t stream)
+void invokeAugmentBatchSlots(SizeType32* augmentedSeqSlots, runtime::SizeType32 const* chunkedContextNextTokens,
+    runtime::SizeType32 const* lastDraftLens, SizeType32 const* seqSlots, SizeType32 engineBatchSize,
+    SizeType32 batchSize, cudaStream_t stream)
 {
     SizeType32 constexpr BLOCK_SIZE = 512;
     TLLM_CHECK_WITH_INFO(
-        actualBatchSize <= BLOCK_SIZE, "Batch size larger than %d is not supported for EAGLE yet", batchSize);
-    augmentBatchSlotsKernel<BLOCK_SIZE><<<1, BLOCK_SIZE, 0, stream>>>(augmentedSeqSlots, augmentedBatchSlots,
-        chunkedContextNextTokens, lastDraftLens, seqSlots, batchSlots, actualBatchSize);
+        engineBatchSize <= BLOCK_SIZE, "Batch size larger than %d is not supported for EAGLE yet", batchSize);
+    augmentBatchSlotsKernel<BLOCK_SIZE><<<1, BLOCK_SIZE, 0, stream>>>(
+        augmentedSeqSlots, chunkedContextNextTokens, lastDraftLens, seqSlots, engineBatchSize);
 }
 
 namespace
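
Since the block-wide scan and the second output array are gone, the kernel now performs a simple per-element selection. A host-side reference of what each thread writes (illustrative only, with made-up values in the usage example):

```cpp
#include <cstdint>
#include <vector>

// For each engine-batch entry, keep the sequence slot if the request needs
// decoding this iteration (last context chunk or a generation request with
// draft tokens), otherwise mark the entry with -1.
std::vector<int32_t> augmentSeqSlotsReference(std::vector<int32_t> const& seqSlots,
    std::vector<int32_t> const& chunkedContextNextTokens, std::vector<int32_t> const& lastDraftLens)
{
    std::vector<int32_t> augmented(seqSlots.size());
    for (std::size_t i = 0; i < seqSlots.size(); ++i)
    {
        auto const draftLen = lastDraftLens[i];
        bool const needDecoding = (draftLen == 0 && chunkedContextNextTokens[i] == -1) || (draftLen > 0);
        augmented[i] = needDecoding ? seqSlots[i] : -1;
    }
    return augmented;
}

int main()
{
    // Entry 1 is a non-last context chunk (next token != -1), so it becomes -1.
    auto const out = augmentSeqSlotsReference({7, 2, 5}, {-1, 9, -1}, {0, 0, 3});
    return out[1] == -1 ? 0 : 1;
}
```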

cpp/tensorrt_llm/kernels/speculativeDecoding/eagleDecodingKernels.h

Lines changed: 4 additions & 10 deletions

@@ -552,29 +552,23 @@ void invokeCopyOutputTokensIds(runtime::TokenIdType const* const* tmpOutputIdsPt
     runtime::SizeType32 const* inputPaths, runtime::SizeType32* outputPaths, runtime::SizeType32 maxPathLen,
     cudaStream_t stream);
 
-//! \brief Augment seq slots and batch slots from batchSize size to engineBatchSize size.
-//! For seqSlot sets -1 for non-last chunks (chunkedContextNextTokens != -1).
-//! For batchSlots sets -1 for non-last chunks. Copies actual batch slots to the last chunk or gen requests
-//! positions.
+//! \brief Augment seq slots so that non-last chunks are set to -1 (if chunkedContextNextTokens != -1).
 //!
 //! \param augmentedSeqSlots output buffer [engineBatchSize]
-//! \param augmentedBatchSlots output buffer [engineBatchSize]
 //! \param chunkedContextNextTokens input buffer [engineBatchSize], indicator of the not last chunk of the ctx
 //! requests. -1 for gen requests and last chunk, != -1 otherwise.
 //! \param lastDraftLens input buffer [engineBatchSize], number of draft tokens input to the current iteration.
 //! 0 for ctx requests and > 0 for gen requests.
 //! \param seqSlots input buffer [engineBatchSize], address map from local index to global index [0, batchSize]
 //! -> [0, maxBatchSize]
-//! \param batchSlots input buffer [engineBatchSize], address map from local index to global index [0, batchSize]
-//! -> [0, maxBatchSize]
 //! \param engineBatchSize number of sequences processed in the engine.
 //! Includes chunked context reqs that are not in the last chunk.
 //! \param batchSize the number of sequences to be decoded
 //! \param stream cuda stream.
-void invokeAugmentBatchSlots(runtime::SizeType32* augmentedSeqSlots, runtime::SizeType32* augmentedBatchSlots,
+void invokeAugmentBatchSlots(runtime::SizeType32* augmentedSeqSlots,
     runtime::SizeType32 const* chunkedContextNextTokens, runtime::SizeType32 const* lastDraftLens,
-    runtime::SizeType32 const* seqSlots, runtime::SizeType32 const* batchSlots, runtime::SizeType32 engineBatchSize,
-    runtime::SizeType32 batchSize, cudaStream_t stream);
+    runtime::SizeType32 const* seqSlots, runtime::SizeType32 engineBatchSize, runtime::SizeType32 batchSize,
+    cudaStream_t stream);
 
 //! \brief For Eagle-2, set topK tensor according to the max topK value for each request.
 //! And fill the batchSlots for the softMax kernel.
