@@ -89,14 +89,15 @@ void DecoderState::setupBuffers(nvinfer1::DataType dtype, BufferManager const& b
8989
9090 dOutput->lengths = bufferManager.emptyTensor (MemoryType::kGPU , nvSizeType);
9191
92- // use batchSize many entries instead of the usual 1
9392 dOutput->finishedSum = bufferManager.emptyTensor (MemoryType::kGPU , nvSizeType);
9493 // we don't need dOutput->lengths because lengths are passed from outside
9594 dOutput->cumLogProbs = bufferManager.emptyTensor (MemoryType::kGPU , nvFloatType);
9695 dOutput->logProbs = bufferManager.emptyTensor (MemoryType::kGPU , nvFloatType);
9796 dOutput->beamHypotheses .empty (bufferManager);
97+
9898 dOutput->finishReasons
9999 = bufferManager.emptyTensor (MemoryType::kGPU , TRTDataType<tk::FinishedState::UnderlyingType>::value);
100+ dInput->finishReasons = dOutput->finishReasons ;
100101
101102 dOutput->logProbsTiled = bufferManager.emptyTensor (MemoryType::kGPU , nvFloatType);
102103
@@ -106,8 +107,6 @@ void DecoderState::setupBuffers(nvinfer1::DataType dtype, BufferManager const& b
106107 dInput->badWordsLens = bufferManager.emptyTensor (MemoryType::kPINNEDPOOL , nvSizeType);
107108 dInput->embeddingBias = bufferManager.emptyTensor (MemoryType::kGPU , dtype);
108109
109- mFinishedSteps = bufferManager.emptyTensor (MemoryType::kGPU , TRTDataType<tk::FinishedState::UnderlyingType>::value);
110-
111110 mBeamSearchBuffers = std::make_unique<BeamSearchBuffers>(bufferManager);
112111
113112 setupCacheIndirectionBuffers (bufferManager);
@@ -245,10 +244,6 @@ void DecoderState::reshapeBuffers(SizeType32 maxBatchSize, SizeType32 maxBeamWid
245244 auto & dOutput = *mJointDecodingOutput ;
246245 dOutput.ids ->reshape (maxTotalTokensShape);
247246
248- auto const maxNewTokensShape = ITensor::makeShape ({mMaxDecodingEngineTokens , mMaxBatchSize , mMaxBeamWidth });
249- mFinishedSteps ->reshape (maxNewTokensShape);
250- bufferManager.setZero (*mFinishedSteps );
251-
252247 dOutput.finishReasons ->reshape (maxBatchSizeXmaxBeamWidthShape);
253248 bufferManager.setZero (*dOutput.finishReasons );
254249
@@ -260,6 +255,7 @@ void DecoderState::reshapeBuffers(SizeType32 maxBatchSize, SizeType32 maxBeamWid
260255 dOutput.finishedSum ->reshape (maxBatchSizeShape);
261256 bufferManager.setZero (*dOutput.finishedSum );
262257
258+ auto const maxNewTokensShape = ITensor::makeShape ({mMaxDecodingEngineTokens , mMaxBatchSize , mMaxBeamWidth });
263259 dOutput.newTokensSteps ->reshape (maxNewTokensShape);
264260 bufferManager.setZero (*dOutput.newTokensSteps );
265261
@@ -342,8 +338,6 @@ void DecoderState::reshapeSpeculativeDecodingBuffers(SpeculativeDecodingMode con
342338 mMaxDecodingEngineTokens );
343339
344340 auto const maxNewTokensShape = ITensor::makeShape ({mMaxDecodingEngineTokens , mMaxBatchSize , mMaxBeamWidth });
345- mFinishedSteps ->reshape (maxNewTokensShape);
346- bufferManager.setZero (*mFinishedSteps );
347341 dOutput.newTokensSteps ->reshape (maxNewTokensShape);
348342 bufferManager.setZero (*dOutput.newTokensSteps );
349343
@@ -454,7 +448,6 @@ void DecoderState::disableLookahead(RequestVector const& genRequests)
454448
455449 auto const maxNewTokensShape = ITensor::makeShape ({mMaxDecodingEngineTokens , mMaxBatchSize , mMaxBeamWidth });
456450 mJointDecodingOutput ->newTokensSteps ->reshape (maxNewTokensShape);
457- mFinishedSteps ->reshape (maxNewTokensShape);
458451
459452 for (auto const & llmReq : genRequests)
460453 {
@@ -562,11 +555,6 @@ TensorPtr DecoderState::getAcceptedPackedPaths() const
562555 return mJointDecodingOutput ->speculativeDecodingOutputs ->pathsOffsets ;
563556}
564557
565- TensorPtr DecoderState::getFinishedSteps () const
566- {
567- return mFinishedSteps ;
568- }
569-
570558SizeType32 DecoderState::getMaxBatchSize () const
571559{
572560 return mMaxBatchSize ;
0 commit comments