Skip to content

Commit c3143c4

Browse files
authored
Merge branch 'main' into clean_cuda_graph
2 parents 2cf1852 + 25389c9 commit c3143c4

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

50 files changed

+2319
-550
lines changed

cpp/include/tensorrt_llm/batch_manager/llmRequest.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,9 @@ enum class LlmRequestState : int32_t
6363
kDISAGG_CONTEXT_TRANS_IN_PROGRESS = 21, ///< Waiting context-only request transmitting the kv cache,
6464
/// after computation finished
6565
kDISAGG_CONTEXT_COMPLETE = 22, ///< Context-only request finished kv cache transmission.
66+
67+
// error states
68+
kDISAGG_TRANS_ERROR = -1, ///< Error occurred during kv cache transmission
6669
};
6770

6871
enum LlmRequestType

cpp/include/tensorrt_llm/common/tllmException.h

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@
2020

2121
#include <array>
2222
#include <cstddef>
23+
#include <cstdint>
24+
#include <limits>
2325
#include <stdexcept>
2426
#include <string>
2527

@@ -35,9 +37,26 @@
3537
#define NEW_TLLM_EXCEPTION(...) \
3638
tensorrt_llm::common::TllmException(__FILE__, __LINE__, tensorrt_llm::common::fmtstr(__VA_ARGS__).c_str())
3739

40+
/// @brief Constructs (does not throw) a RequestSpecificException carrying the call site's file and line.
/// @param requestID Request identifier forwarded to the exception constructor.
/// @param errorCode RequestErrorCode forwarded to the exception constructor.
/// @param ...       printf-style format string and arguments, rendered through fmtstr.
#define TLLM_REQUEST_EXCEPTION(requestID, errorCode, ...)                                                              \
    tensorrt_llm::common::RequestSpecificException(                                                                    \
        __FILE__, __LINE__, tensorrt_llm::common::fmtstr(__VA_ARGS__).c_str(), requestID, errorCode)
43+
3844
namespace tensorrt_llm::common
3945
{
4046

47+
/// @brief Error codes carried by request-specific exceptions.
///
/// Codes are grouped by category in ranges of 1000.
enum class RequestErrorCode : uint32_t
{
    kUNKNOWN_ERROR = 0, ///< General errors use the range 0-999.

    kNETWORK_ERROR = 1000, ///< Network and communication errors use the range 1000-1999.
};
56+
57+
/// @brief Sentinel request ID used when the originating request is not known.
///
/// Declared `inline constexpr` so that every translation unit including this header
/// refers to the same entity instead of getting its own internal-linkage copy.
inline constexpr uint64_t kUNKNOWN_REQUEST_ID = std::numeric_limits<uint64_t>::max();
59+
4160
class TllmException : public std::runtime_error
4261
{
4362
public:
@@ -66,4 +85,21 @@ class TllmException : public std::runtime_error
6685
throw TllmException(file, line, fmtstr("[TensorRT-LLM][ERROR] Assertion failed: %s", info.c_str()).c_str());
6786
}
6887

88+
class RequestSpecificException : public std::runtime_error
89+
{
90+
public:
91+
explicit RequestSpecificException(
92+
std::string const& file, std::size_t line, char const* msg, uint64_t requestID, RequestErrorCode errorCode);
93+
94+
~RequestSpecificException() noexcept override;
95+
96+
[[nodiscard]] uint64_t getRequestId() const noexcept;
97+
98+
[[nodiscard]] RequestErrorCode getErrorCode() const noexcept;
99+
100+
private:
101+
uint64_t mRequestID;
102+
RequestErrorCode mErrorCode;
103+
};
104+
69105
} // namespace tensorrt_llm::common

cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp

Lines changed: 28 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -427,8 +427,17 @@ void CacheTransceiver::checkContextTransferStatus(std::optional<int> const& atLe
427427
auto& [request, future] = *it;
428428
if (blockAll || (toCompleteIdSet.find(request->mRequestId) != toCompleteIdSet.end()))
429429
{
430-
future.get();
431-
request->setState(LlmRequestState::kDISAGG_CONTEXT_COMPLETE);
430+
try
431+
{
432+
future.get();
433+
request->setState(LlmRequestState::kDISAGG_CONTEXT_COMPLETE);
434+
}
435+
catch (std::exception const& e)
436+
{
437+
TLLM_LOG_ERROR(
438+
"Error occurred during context transfer for request %ld: %s", request->mRequestId, e.what());
439+
request->setState(LlmRequestState::kDISAGG_TRANS_ERROR);
440+
}
432441
it = mResponderFutures.erase(it);
433442
}
434443
else
@@ -521,19 +530,28 @@ void CacheTransceiver::checkGenTransferStatus(std::optional<int> const& atLeastR
521530
{
522531
if (blockAll || toCompleteIdSet.find(it->first->mRequestId) != toCompleteIdSet.end())
523532
{
524-
it->second.get();
525-
526-
// Gather the kv cache transfer time from all workers and update to leader rank
527-
if (!common::getEnvKVCacheTransferOutputPath().empty())
533+
try
534+
{
535+
it->second.get();
536+
it->first->setState(LlmRequestState::kDISAGG_GENERATION_TRANS_COMPLETE);
537+
538+
// Gather the kv cache transfer time from all workers and update to leader rank
539+
if (!common::getEnvKVCacheTransferOutputPath().empty())
540+
{
541+
auto syncComm
542+
= mCacheState->getParallelConfig().mEnableAttentionDP ? mMpiGroupDataComm.get() : mMpiGroupComm;
543+
updateKVCacheTransferBW(*syncComm, it->first);
544+
}
545+
}
546+
catch (std::exception const& e)
528547
{
529-
auto syncComm
530-
= mCacheState->getParallelConfig().mEnableAttentionDP ? mMpiGroupDataComm.get() : mMpiGroupComm;
531-
updateKVCacheTransferBW(*syncComm, it->first);
548+
TLLM_LOG_ERROR(
549+
"Error occurred during generation transfer for request %ld: %s", it->first->mRequestId, e.what());
550+
it->first->setState(LlmRequestState::kDISAGG_TRANS_ERROR);
532551
}
533552
TLLM_LOG_DEBUG(mpi::MpiComm::world().getRank(),
534553
"**** it->first->mRequestId: %ld, context request ID: %ld ******** get feature ***",
535554
it->first->mRequestId, it->first->getContextPhaseParams().value().getReqId());
536-
it->first->setState(LlmRequestState::kDISAGG_GENERATION_TRANS_COMPLETE);
537555
it = mRequesterFutures.erase(it);
538556
}
539557
else

cpp/tensorrt_llm/batch_manager/dataTransceiver.cpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#include "tensorrt_llm/batch_manager/runtimeBuffers.h"
2323
#include "tensorrt_llm/common/envUtils.h"
2424
#include "tensorrt_llm/common/logger.h"
25+
#include "tensorrt_llm/common/tllmException.h"
2526
#include "tensorrt_llm/common/utils.h"
2627
#include "tensorrt_llm/runtime/utils/mpiUtils.h"
2728
#include <future>
@@ -190,6 +191,12 @@ class DataResponder::Impl
190191
mSender->release(id);
191192
resp.mPromise.set_value();
192193
}
194+
catch (tensorrt_llm::common::RequestSpecificException const& e)
195+
{
196+
TLLM_LOG_ERROR("Exception in sendAndRemoveResponse: %s ", e.what());
197+
auto new_exception = TLLM_REQUEST_EXCEPTION(id, e.getErrorCode(), "%s", e.what());
198+
resp.mPromise.set_exception(std::make_exception_ptr(new_exception));
199+
}
193200
catch (std::exception const& e)
194201
{
195202
TLLM_LOG_ERROR("Exception in sendAndRemoveResponse: %s ", e.what());
@@ -496,6 +503,15 @@ class DataRequester::Impl
496503
requestSync(*requestAndPromise.mRequest);
497504
requestAndPromise.mPromise->set_value();
498505
}
506+
catch (tensorrt_llm::common::RequestSpecificException const& err)
507+
{
508+
TLLM_LOG_ERROR("Exception in DataRequester request(): request id:%zu , request context id:%zu : %s",
509+
requestAndPromise.mRequest->mRequestId,
510+
requestAndPromise.mRequest->getContextPhaseParams().value().getReqId(), err.what());
511+
auto new_exception = TLLM_REQUEST_EXCEPTION(
512+
requestAndPromise.mRequest->mRequestId, err.getErrorCode(), "%s", err.what());
513+
requestAndPromise.mPromise->set_exception(std::make_exception_ptr(new_exception));
514+
}
499515
catch (std::exception const& err)
500516
{
501517
TLLM_LOG_ERROR("Exception in DataRequester request(): request id:%ld , request context id:%ld : %s",

cpp/tensorrt_llm/batch_manager/dataTransceiver.h

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -151,12 +151,28 @@ class TransferSession
151151

152152
void send(size_t idx, void const* data, size_t size)
153153
{
154-
mConnections.at(idx)->send(mDataContext, data, size);
154+
try
155+
{
156+
mConnections.at(idx)->send(mDataContext, data, size);
157+
}
158+
catch (std::exception const& e)
159+
{
160+
throw common::RequestSpecificException(
161+
__FILE__, __LINE__, e.what(), mRequest->mRequestId, common::RequestErrorCode::kNETWORK_ERROR);
162+
}
155163
}
156164

157165
void recv(size_t idx, void* data, size_t size)
158166
{
159-
mConnections.at(idx)->recv(mDataContext, data, size);
167+
try
168+
{
169+
mConnections.at(idx)->recv(mDataContext, data, size);
170+
}
171+
catch (std::exception const& e)
172+
{
173+
throw common::RequestSpecificException(
174+
__FILE__, __LINE__, e.what(), mRequest->mRequestId, common::RequestErrorCode::kNETWORK_ERROR);
175+
}
160176
}
161177

162178
[[nodiscard]] LlmRequest const& getLlmRequest() const

cpp/tensorrt_llm/common/tllmException.cpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include "tensorrt_llm/common/tllmException.h"
1818
#include "tensorrt_llm/common/stringUtils.h"
1919

20+
#include <cinttypes>
2021
#include <cstdlib>
2122
#if !defined(_MSC_VER)
2223
#include <cxxabi.h>
@@ -106,4 +107,25 @@ std::string TllmException::demangle(char const* name)
106107
#endif
107108
}
108109

110+
/// @brief Builds the exception message and stores the request ID and error code.
///
/// The what() text has the form
/// "<msg> (Request ID: <id>, Error Code: <code>) (<file>:<line>)".
RequestSpecificException::RequestSpecificException(
    std::string const& file, std::size_t line, char const* msg, uint64_t requestID, RequestErrorCode errorCode)
    : std::runtime_error(fmtstr("%s (Request ID: %" PRIu64 ", Error Code: %u) (%s:%zu)", msg, requestID,
        static_cast<uint32_t>(errorCode), file.c_str(), line))
    , mRequestID(requestID)
    , mErrorCode(errorCode)
{
}
118+
119+
/// Destructor is declared in the header and defaulted here, out of line.
RequestSpecificException::~RequestSpecificException() noexcept = default;
120+
121+
uint64_t RequestSpecificException::getRequestId() const noexcept
122+
{
123+
return mRequestID;
124+
}
125+
126+
/// @brief Returns the error code that was passed to the constructor.
RequestErrorCode RequestSpecificException::getErrorCode() const noexcept
{
    return this->mErrorCode;
}
130+
109131
} // namespace tensorrt_llm::common

cpp/tensorrt_llm/executor/cache_transmission/ucx_utils/connection.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,16 @@
2020

2121
#include "tensorrt_llm/batch_manager/dataTransceiverImpl.h"
2222
#include "tensorrt_llm/common/cudaUtils.h"
23+
#include "tensorrt_llm/common/tllmException.h"
2324
#include "tensorrt_llm/executor/cache_transmission/ucx_utils/connection.h"
2425

2526
namespace tensorrt_llm::executor::kv_cache
2627
{
2728

29+
// Type aliases to shorten the code
30+
using RequestSpecificException = tensorrt_llm::common::RequestSpecificException;
31+
using RequestErrorCode = tensorrt_llm::common::RequestErrorCode;
32+
2833
UcxConnection::UcxConnection(ConnectionIdType connectionId, std::shared_ptr<ucxx::Endpoint> endpoint,
2934
UcxConnectionManager* manager, bool fromRequester)
3035
: mConnectionId(connectionId)

cpp/tensorrt_llm/nanobind/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ set(SRCS
1010
batch_manager/kvCacheConnector.cpp
1111
batch_manager/kvCacheManager.cpp
1212
batch_manager/llmRequest.cpp
13+
common/tllmExceptions.cpp
1314
executor/bindings.cpp
1415
executor/executor.cpp
1516
executor/executorConfig.cpp

cpp/tensorrt_llm/nanobind/bindings.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
#include "tensorrt_llm/nanobind/batch_manager/kvCacheConnector.h"
3838
#include "tensorrt_llm/nanobind/batch_manager/kvCacheManager.h"
3939
#include "tensorrt_llm/nanobind/batch_manager/llmRequest.h"
40+
#include "tensorrt_llm/nanobind/common/tllmExceptions.h"
4041
#include "tensorrt_llm/nanobind/executor/bindings.h"
4142
#include "tensorrt_llm/nanobind/runtime/bindings.h"
4243
#include "tensorrt_llm/nanobind/testing/modelSpecBinding.h"
@@ -127,9 +128,11 @@ NB_MODULE(TRTLLM_NB_MODULE, m)
127128
auto mInternalTesting = mInternal.def_submodule("testing", "Testing internal bindings");
128129
auto mInternalBatchManager = mInternal.def_submodule("batch_manager", "Batch manager internal bindings");
129130
auto mInternalThop = mInternal.def_submodule("thop", "Torch op internal bindings");
131+
auto mExceptions = m.def_submodule("exceptions", "Exceptions internal bindings");
130132

131133
tensorrt_llm::nanobind::executor::initBindings(mExecutor);
132134
tensorrt_llm::nanobind::runtime::initBindingsEarly(mInternalRuntime);
135+
tensorrt_llm::nanobind::common::initExceptionsBindings(mExceptions);
133136
tensorrt_llm::nanobind::thop::initBindings(mInternalThop);
134137

135138
auto buildInfo = m.def_submodule("BuildInfo");
@@ -471,7 +474,8 @@ NB_MODULE(TRTLLM_NB_MODULE, m)
471474
.value("DISAGG_CONTEXT_COMPLETE", tb::LlmRequestState::kDISAGG_CONTEXT_COMPLETE)
472475
.value("DISAGG_GENERATION_TRANS_IN_PROGRESS", tb::LlmRequestState::kDISAGG_GENERATION_TRANS_IN_PROGRESS)
473476
.value("DISAGG_GENERATION_TRANS_COMPLETE", tb::LlmRequestState::kDISAGG_GENERATION_TRANS_COMPLETE)
474-
.value("DISAGG_CONTEXT_INIT_AND_TRANS", tb::LlmRequestState::kDISAGG_CONTEXT_INIT_AND_TRANS);
477+
.value("DISAGG_CONTEXT_INIT_AND_TRANS", tb::LlmRequestState::kDISAGG_CONTEXT_INIT_AND_TRANS)
478+
.value("DISAGG_TRANS_ERROR", tb::LlmRequestState::kDISAGG_TRANS_ERROR);
475479

476480
nb::class_<tr::MemoryCounters>(m, "MemoryCounters")
477481
.def_static("instance", &tr::MemoryCounters::getInstance, nb::rv_policy::reference)
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
/*
2+
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#include "tllmExceptions.h"
18+
#include "tensorrt_llm/common/tllmException.h"
19+
#include <nanobind/nanobind.h>
20+
21+
namespace tc = tensorrt_llm::common;
22+
namespace nb = nanobind;
23+
24+
namespace tensorrt_llm::nanobind::common
25+
{
26+
27+
/// @brief Registers the request-specific exception bindings on module @p m.
///
/// Exposes the RequestErrorCode enum and a Python exception class named
/// "RequestSpecificException", and installs a translator that turns a C++
/// tc::RequestSpecificException into an instance of that class with
/// `request_id` and `error_code` (as an unsigned integer) attributes set.
///
/// @param m Module that receives the enum and the exception class.
void initExceptionsBindings(nb::module_& m)
{
    // Bind the RequestErrorCode enum
    nb::enum_<tc::RequestErrorCode>(m, "RequestErrorCode")
        .value("UNKNOWN_ERROR", tc::RequestErrorCode::kUNKNOWN_ERROR)
        .value("NETWORK_ERROR", tc::RequestErrorCode::kNETWORK_ERROR)
        .export_values();

    // Create the Python exception class. One reference is released on purpose and never
    // dropped: the translator below receives the class as a raw payload pointer, so it
    // must stay valid for the life of the process, and no C++ static destructor should
    // touch a Python object after the interpreter has been finalized.
    nb::handle const requestSpecificExc
        = nb::exception<tc::RequestSpecificException>(m, "RequestSpecificException").release();

    // Class-level defaults so the attributes always exist on the Python type
    requestSpecificExc.attr("request_id") = nb::none();
    requestSpecificExc.attr("error_code") = nb::none();

    // Register exception translator to convert C++ exceptions to Python
    nb::register_exception_translator(
        [](std::exception_ptr const& p, void* payload)
        {
            try
            {
                if (p)
                {
                    std::rethrow_exception(p);
                }
            }
            catch (tc::RequestSpecificException const& e)
            {
                nb::handle const excType(static_cast<PyObject*>(payload));

                // Instantiate the Python exception from its message. The C++ exception type is
                // not bound as a nanobind class, so it cannot be converted with nb::cast.
                nb::object pyExc = excType(nb::str(e.what()));

                // Attach the request ID and error code to this instance
                pyExc.attr("request_id") = nb::cast(e.getRequestId());
                pyExc.attr("error_code") = nb::cast(static_cast<uint32_t>(e.getErrorCode()));

                PyErr_SetObject(excType.ptr(), pyExc.ptr());
            }
        },
        requestSpecificExc.ptr());
}
66+
67+
} // namespace tensorrt_llm::nanobind::common

0 commit comments

Comments
 (0)