Update TensorRT-LLM (#2363)
* Update TensorRT-LLM

---------

Co-authored-by: tonylek <[email protected]>
kaiyux and tonylek authored Oct 22, 2024
1 parent 75057cd commit 1730a58
Showing 291 changed files with 1,113,570 additions and 822,107 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -55,3 +55,5 @@ cpp/include/tensorrt_llm/executor/version.h

# User config files
CMakeUserPresets.json
compile_commands.json
*.bin
15 changes: 9 additions & 6 deletions benchmarks/python/enc_dec_benchmark.py
@@ -225,7 +225,8 @@ def set_weight_streaming(self, config):
self.decoder_session.runtime._set_weight_streaming(gpu_weights_percent)

def prepare_inputs(self, config):
batch_size, encoder_input_len = config[0], config[1]
batch_size, encoder_input_len, output_len = config[0], config[
1], config[2]
attention_mask = None
whisper_decoder_encoder_input_lengths = None
outputs = {}
@@ -271,7 +272,8 @@ def prepare_inputs(self, config):
dtype=torch.int32,
device='cuda')
cross_attention_mask = torch.ones([
outputs['encoder_output'].shape[0], 1,
outputs['encoder_output'].shape[0],
decoder_input_lengths.max() + output_len,
outputs['encoder_output'].shape[1]
]).int().cuda()
else:
@@ -297,8 +299,11 @@ def prepare_inputs(self, config):
(batch_size, encoder_input_len)).int().cuda()
# cross attention mask, always set 1 as if all are valid tokens
# [batch_size, query_len, encoder_input_len] currently, use query_len=1
cross_attention_mask = torch.ones(
(batch_size, 1, encoder_input_len)).int().cuda()
cross_attention_mask = [
torch.ones(decoder_input_lengths.max() + output_len,
encoder_input_len).int().cuda()
for _ in range(batch_size)
]

hidden_size = (self.encoder_model_config.hidden_size *
self.world_size) # tp_size
@@ -396,8 +401,6 @@ def run(self, inputs, config, benchmark_profiler=None):
encoder_max_input_length=encoder_max_input_length,
)

cross_attention_mask = None if self.decoder_model_config.gpt_attention_plugin else cross_attention_mask

self.decoder_session.decode(
decoder_input_ids,
decoder_input_lengths,
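One functional change in the benchmark above is easy to miss: the cross-attention mask is no longer a single `[batch_size, 1, encoder_input_len]` tensor with a singleton query dimension; it is now built per sample, with one mask of shape `[decoder_input_lengths.max() + output_len, encoder_input_len]` per batch entry. Below is a minimal sketch of the two layouts, assuming a CUDA device and hypothetical placeholder sizes (the real values come from the benchmark config and `decoder_input_lengths`):

```python
import torch

# Hypothetical sizes standing in for the benchmark's config values.
batch_size, encoder_input_len, output_len = 2, 1500, 64
decoder_input_lengths = torch.tensor([4, 4], dtype=torch.int32, device='cuda')

# Old layout: a single tensor with a singleton query dimension.
old_cross_attention_mask = torch.ones(
    (batch_size, 1, encoder_input_len)).int().cuda()

# New layout: one [max_decoder_len + output_len, encoder_input_len] mask per
# sample, so every position generated during decoding has its own mask row.
new_cross_attention_mask = [
    torch.ones(decoder_input_lengths.max() + output_len,
               encoder_input_len).int().cuda()
    for _ in range(batch_size)
]
```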
2 changes: 1 addition & 1 deletion cpp/CMakeLists.txt
@@ -543,7 +543,7 @@ endif()

# Defer UCX/UCXX setup until after USE_CXX11_ABI is well defined, as UCXX will
# need to be built to have aligned symbols
set_ifndef(ENABLE_UCX 0)
set_ifndef(ENABLE_UCX 1)
if(ENABLE_UCX)
# Only enable UCX related features if the system has UCX library
find_package(ucx)
102 changes: 78 additions & 24 deletions cpp/include/tensorrt_llm/batch_manager/llmRequest.h
@@ -106,6 +106,7 @@ class GenericLlmRequest
executor::PriorityType priority = executor::Request::kDefaultPriority,
std::optional<TensorPtr> encoderInputFeatures = std::nullopt,
std::optional<SizeType32> encoderOutputLength = std::nullopt,
std::optional<TensorPtr> crossAttentionMask = std::nullopt,
LlmRequestType llmRequestType = LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION,
std::optional<std::shared_ptr<VecTokenExtraIds>> inputTokenExtraIds = std::nullopt,
SizeType32 numReturnSequences = 1)
@@ -150,6 +151,7 @@ class GenericLlmRequest
, mFinishReasons(samplingConfig.beamWidth)
, mEncoderInputFeatures(std::move(encoderInputFeatures))
, mEncoderOutputLength(encoderOutputLength)
, mCrossAttentionMask(std::move(crossAttentionMask))
, mLlmRequestType(llmRequestType)
, mInputTokenExtraIds(std::move(inputTokenExtraIds))
, mNumReturnSequences(numReturnSequences)
@@ -205,7 +207,7 @@
, mEncoderOutputLength(req.getEncoderOutputLength())
, mContextPhaseParams(req.getContextPhaseParams())
, mInputTokenExtraIds(std::nullopt)
, mNumReturnSequences(req.getNumReturnSequences())
, mNumReturnSequences(1)
, mSequenceIndex(0)
{
if (req.getRequestType() == executor::RequestType::REQUEST_TYPE_GENERATION_ONLY)
@@ -243,7 +245,8 @@

if (req.getEmbeddingBias())
{
mEmbeddingBias = executor::detail::toITensor(req.getEmbeddingBias().value());
mEmbeddingBias
= tensorrt_llm::runtime::ITensor::view(executor::detail::toITensor(req.getEmbeddingBias().value()));
// Add leading 1 dimension since that's what IFB code expects
mEmbeddingBias.value()->unsqueeze(0);
}
@@ -324,6 +327,16 @@
mEncoderInputFeatures = std::nullopt;
}

auto const& crossAttentionMask = req.getCrossAttentionMask();
if (crossAttentionMask.has_value())
{
mCrossAttentionMask = executor::detail::toITensor(crossAttentionMask.value());
}
else
{
mCrossAttentionMask = std::nullopt;
}

switch (req.getRequestType())
{
case executor::RequestType::REQUEST_TYPE_CONTEXT_AND_GENERATION:
@@ -393,15 +406,6 @@
mMaxNewTokens = maxNewTokens;
}

if (mNumReturnSequences > 1 && mSamplingConfig.beamWidth > 1)
{
TLLM_THROW(
"Using mNumReturnSequences (%d) > 1 with beam search is currently disabled, since TensorRT-LLM returns "
"a total of mNumReturnSequences x beamWidth beams, rather than limiting the number of returned beams "
"to mNumReturnSequences. This restriction will be removed once the issue is resolved.",
mNumReturnSequences);
}

TLLM_CHECK_WITH_INFO(mSamplingConfig.validate(), "Incorrect sampling config");

// validate extra ids when enabling kv cache reuse with prompt table
@@ -452,9 +456,20 @@
/// @return The number of sequences to return.
[[nodiscard]] SizeType32 getNumReturnSequences() const
{
TLLM_LOG_WARNING(
"mNumReturnSequences in the LlmRequest class is deprecated. Please use numReturnSequences in "
"SamplingConfig directly.");
return mNumReturnSequences;
}

/// @brief Get the number of subrequests, the expected number of responses under non-streaming mode. In sampling
/// mode, it will be equal to mSamplingConfig.numReturnSequences, while it will be equal to 1 in beam search.
/// @return The number of subrequests in total request size.
[[nodiscard]] SizeType32 getNumSubRequests() const
{
return mSamplingConfig.beamWidth == 1 ? mSamplingConfig.numReturnSequences.value_or(1) : 1;
}

/// @brief Get child requests spawned by this req.
/// @return A vector of child requests.
[[nodiscard]] std::vector<RequestPtr> const& getChildRequests() const
@@ -661,8 +676,8 @@ class GenericLlmRequest
TLLM_CHECK_WITH_INFO(mChildRequests.size() <= static_cast<size_t>(numReturnSequences),
"Cannot set numReturnSequences %d smaller than the number %ld of child requests that have already created.",
numReturnSequences, mChildRequests.size());
mNumReturnSequences = numReturnSequences;
mSequenceFinalVec->resize(mNumReturnSequences);
mSamplingConfig.numReturnSequences = numReturnSequences;
mSequenceFinalVec->resize(numReturnSequences);
}

[[nodiscard]] bool constexpr isChild() const noexcept
@@ -1021,6 +1036,11 @@ class GenericLlmRequest
TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}

[[nodiscard]] TensorPtr const getCrossAttentionMask() const
{
return mCrossAttentionMask.value_or(nullptr);
}

[[nodiscard]] bool constexpr isStreaming() const noexcept
{
return mIsStreaming;
@@ -1267,6 +1287,12 @@ class GenericLlmRequest
return mPriority;
}

/// Get the counter of decoding iterations.
SizeType32 getDecodingIter()
{
return mDecodingIter;
}

/// Increment the counter of decoding iterations.
void advanceDecodingIter()
{
@@ -1307,7 +1333,6 @@ class GenericLlmRequest
result.isFinal = std::all_of(mSequenceFinalVec->begin(), mSequenceFinalVec->end(),
[](bool isSequenceFinal) { return isSequenceFinal; });

auto const nbBeams = mSamplingConfig.beamWidth;
auto const maxNbTokens = getMaxBeamNumTokens();

if (isDisaggContextTransmissionState() && isContextOnlyRequest())
@@ -1335,6 +1360,8 @@

auto const maxNbTokensOut = calculateNbTokensOut(maxNbTokens);

auto const nbBeams = mSamplingConfig.getNumReturnBeams();

result.outputTokenIds.resize(nbBeams);

auto const startTokenPos = maxNbTokens - maxNbTokensOut;
@@ -1359,10 +1386,13 @@
}
}

auto sliceBeams = [&nbBeams](auto beams)
{ return std::vector<typename decltype(beams)::value_type>(beams.begin(), beams.begin() + nbBeams); };

if (returnLogProbs())
{
result.cumLogProbs = getCumLogProbs();
result.logProbs = getLogProbs();
result.cumLogProbs = sliceBeams(getCumLogProbs());
result.logProbs = sliceBeams(getLogProbs());
}

if (getReturnContextLogits())
@@ -1372,7 +1402,8 @@

if (getReturnGenerationLogits())
{
if (isStreaming())
bool hasDraftTokens = (mDraftTokens && mDraftTokens->size() > 0) ? true : false;
if (isStreaming() && !hasDraftTokens)
{
auto startGenTokenPos = startTokenPos - getOrigPromptLen();
TensorPtr generationLogitsHostCurrentStep
@@ -1386,7 +1417,8 @@
}
else
{
result.generationLogits = executor::detail::ofITensor(getGenerationLogitsHost());
result.generationLogits = executor::detail::ofITensor(
runtime::ITensor::slice(getGenerationLogitsHost(), 0, nbBeams));
}
}

@@ -1395,7 +1427,7 @@
result.encoderOutput = executor::detail::ofITensor(getEncoderOutputHost());
}

result.finishReasons = mFinishReasons;
result.finishReasons = sliceBeams(mFinishReasons);
result.decodingIter = mDecodingIter;

// Update position of last sent response
@@ -1560,6 +1592,7 @@ class GenericLlmRequest
std::optional<SizeType32>
mEncoderOutputLength; // For some models like Whisper, encoder output shape cannot be inferred from encoder
// input shape due to downsampling. Thus this is needed for setting buffer sizes correctly
std::optional<TensorPtr> mCrossAttentionMask; // Input cross attention mask
LlmRequestType mLlmRequestType;
std::optional<executor::ContextPhaseParams> mContextPhaseParams;

Expand Down Expand Up @@ -1644,10 +1677,30 @@ class GenericLlmRequest

setReturnLogProbs(outputLogProbs);

// Handling the backward compatibility of numReturnSequences.
if (mNumReturnSequences > 1)
{
if (!mSamplingConfig.numReturnSequences)
{
TLLM_LOG_WARNING(
"In the Executor class, mNumReturnSequences is deprecated. Please set numReturnSequences in "
"SamplingConfig directly.");
}
else if (mSamplingConfig.numReturnSequences
&& mSamplingConfig.numReturnSequences.value() != mNumReturnSequences)
{
TLLM_THROW(
"In the Executor class, both mSamplingConfig.numReturnSequences (%d) and mNumReturnSequences (%d) "
"are provided but unmatched. Please use numReturnSequences in SamplingConfig directly.",
mSamplingConfig.numReturnSequences.value(), mNumReturnSequences);
}
mSamplingConfig.numReturnSequences = mNumReturnSequences;
}

if (!isChild())
{
// Initialize result states unless it is a child and a child request should share parent's one.
mSequenceFinalVec = std::make_shared<std::vector<bool>>(getNumReturnSequences(), false);
mSequenceFinalVec = std::make_shared<std::vector<bool>>(getNumSubRequests(), false);
}
}

@@ -1715,6 +1768,7 @@ class LlmRequest : public GenericLlmRequest<runtime::ITensor::SharedPtr>
executor::PriorityType priority = executor::Request::kDefaultPriority,
std::optional<TensorPtr> encoderInputFeatures = std::nullopt,
std::optional<SizeType32> encoderOutputLength = std::nullopt,
std::optional<TensorPtr> crossAttentionMask = std::nullopt,
LlmRequestType llmRequestType = LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION,
std::optional<std::shared_ptr<VecTokenExtraIds>> inputTokenExtraIds = std::nullopt,
SizeType32 numReturnSequences = 1)
@@ -1724,8 +1778,8 @@
std::move(lookaheadConfig), returnLogProbs, returnContextLogits, returnGenerationLogits,
std::move(draftTokens), std::move(draftLogits), excludeInputFromOutput, std::move(logitsPostProcessor),
applyLogitsPostProcessorBatched, std::move(encoderInputTokens), returnEncoderOutput, clientId, priority,
std::move(encoderInputFeatures), std::move(encoderOutputLength), llmRequestType,
std::move(inputTokenExtraIds), numReturnSequences)
std::move(encoderInputFeatures), std::move(encoderOutputLength), std::move(crossAttentionMask),
llmRequestType, std::move(inputTokenExtraIds), numReturnSequences)
{
}

@@ -1742,8 +1796,8 @@
std::shared_ptr<LlmRequest> createChildRequest(RequestIdType requestId)
{
TLLM_CHECK_WITH_INFO(!isChild(), "A child request cannot create its own child.");
TLLM_CHECK_WITH_INFO(mChildRequests.size() + 1 < static_cast<size_t>(getNumReturnSequences()),
"Cannot create child requests more than the number of return sequences (%d)", getNumReturnSequences());
TLLM_CHECK_WITH_INFO(mChildRequests.size() + 1 < static_cast<size_t>(getNumSubRequests()),
"Cannot create child requests more than the number of return sequences (%d)", getNumSubRequests());
auto childReq = std::make_shared<LlmRequest>(*this);
childReq->mRequestId = requestId;
childReq->mSequenceIndex = mChildRequests.size() + 1;
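The `llmRequest.h` changes above fold `numReturnSequences` into `SamplingConfig` and add `getNumSubRequests()`, which determines how many sibling requests a request fans out into: with sampling (`beamWidth == 1`) each returned sequence becomes its own subrequest (the parent plus children created via `createChildRequest`), while with beam search all beams stay in a single request and the result is instead sliced to the number of returned beams. The following is a rough sketch of that counting rule, written as a standalone helper purely for illustration; the name echoes the header but this is not the library API:

```python
from typing import Optional


def num_sub_requests(beam_width: int,
                     num_return_sequences: Optional[int]) -> int:
    """Mirrors GenericLlmRequest::getNumSubRequests from the diff above:
    sampling fans out into one subrequest per returned sequence, while
    beam search keeps all beams inside a single request."""
    if beam_width == 1:
        return num_return_sequences if num_return_sequences is not None else 1
    return 1


# Sampling: three returned sequences -> three subrequests
# (one parent request plus two children from createChildRequest).
assert num_sub_requests(beam_width=1, num_return_sequences=3) == 3

# Beam search: the beams live in one request, so a single subrequest.
assert num_sub_requests(beam_width=4, num_return_sequences=3) == 1
```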
2 changes: 1 addition & 1 deletion cpp/include/tensorrt_llm/common/cudaUtils.h
@@ -177,8 +177,8 @@ inline void syncAndCheck(char const* const file, int const line)
{
if (doCheckError())
{
cudaDeviceSynchronize();
check(cudaGetLastError(), "cudaGetLastError", file, line);
check(cudaDeviceSynchronize(), "cudaDeviceSynchronize", file, line);
}
}

1 change: 1 addition & 0 deletions cpp/include/tensorrt_llm/common/mpiUtils.h
@@ -27,6 +27,7 @@
#include <cuda_bf16.h>
#endif

#include <cstdint>
#include <cstdlib>
#include <memory>
