QiJune
diff --git a/‎cpp/include/tensorrt_llm/batch_manager/llmRequest.h‎
Lines changed: 3 additions & 0 deletions b/‎cpp/include/tensorrt_llm/batch_manager/llmRequest.h‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎cpp/include/tensorrt_llm/common/tllmException.h‎
Lines changed: 36 additions & 0 deletions b/‎cpp/include/tensorrt_llm/common/tllmException.h‎
Lines changed: 36 additions & 0 deletions
diff --git a/‎cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp‎
Lines changed: 28 additions & 10 deletions b/‎cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp‎
Lines changed: 28 additions & 10 deletions
diff --git a/‎cpp/tensorrt_llm/batch_manager/dataTransceiver.cpp‎
Lines changed: 16 additions & 0 deletions b/‎cpp/tensorrt_llm/batch_manager/dataTransceiver.cpp‎
Lines changed: 16 additions & 0 deletions
diff --git a/‎cpp/tensorrt_llm/batch_manager/dataTransceiver.h‎
Lines changed: 18 additions & 2 deletions b/‎cpp/tensorrt_llm/batch_manager/dataTransceiver.h‎
Lines changed: 18 additions & 2 deletions
diff --git a/‎cpp/tensorrt_llm/common/tllmException.cpp‎
Lines changed: 22 additions & 0 deletions b/‎cpp/tensorrt_llm/common/tllmException.cpp‎
Lines changed: 22 additions & 0 deletions
diff --git a/‎cpp/tensorrt_llm/executor/cache_transmission/ucx_utils/connection.cpp‎
Lines changed: 5 additions & 0 deletions b/‎cpp/tensorrt_llm/executor/cache_transmission/ucx_utils/connection.cpp‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎cpp/tensorrt_llm/nanobind/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions b/‎cpp/tensorrt_llm/nanobind/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎cpp/tensorrt_llm/nanobind/bindings.cpp‎
Lines changed: 5 additions & 1 deletion b/‎cpp/tensorrt_llm/nanobind/bindings.cpp‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎cpp/tensorrt_llm/nanobind/common/tllmExceptions.cpp‎
Lines changed: 67 additions & 0 deletions b/‎cpp/tensorrt_llm/nanobind/common/tllmExceptions.cpp‎
Lines changed: 67 additions & 0 deletions
@@ -63,6 +63,9 @@ enum class LlmRequestState : int32_t
     kDISAGG_CONTEXT_TRANS_IN_PROGRESS = 21, ///< Waiting context-only request transmitting the kv cache,
                                             /// after computation finished
     kDISAGG_CONTEXT_COMPLETE = 22,          ///< Context-only request finished kv cache transmission.
+
+    // error states
+    kDISAGG_TRANS_ERROR = -1, ///< Error occurred during kv cache transmission
 };
 
 enum LlmRequestType
 
@@ -20,6 +20,8 @@
 
 #include <array>
 #include <cstddef>
+#include <cstdint>
+#include <limits>
 #include <stdexcept>
 #include <string>
 
@@ -35,9 +37,26 @@
 #define NEW_TLLM_EXCEPTION(...)                                                                                        \
     tensorrt_llm::common::TllmException(__FILE__, __LINE__, tensorrt_llm::common::fmtstr(__VA_ARGS__).c_str())
 
+#define TLLM_REQUEST_EXCEPTION(requestID, errorCode, ...) /* Builds (does not throw) the exception object */           \
+    tensorrt_llm::common::RequestSpecificException(                                                                    \
+        __FILE__, __LINE__, tensorrt_llm::common::fmtstr(__VA_ARGS__).c_str(), requestID, errorCode)
+
 namespace tensorrt_llm::common
 {
 
+/// @brief Error codes carried by RequestSpecificException; values are grouped into numeric ranges by category
+enum class RequestErrorCode : uint32_t
+{
+    // General errors (0-999)
+    kUNKNOWN_ERROR = 0, ///< Generic code; presumably the fallback when no category applies (TODO confirm)
+
+    // Network and communication errors (1000-1999)
+    kNETWORK_ERROR = 1000, ///< Used by TransferSession::send/recv when the connection call throws
+};
+
+/// @brief Sentinel request ID (the maximum uint64_t value) meaning "request ID unknown"
+static constexpr uint64_t kUNKNOWN_REQUEST_ID = std::numeric_limits<uint64_t>::max();
+
 class TllmException : public std::runtime_error
 {
 public:
@@ -66,4 +85,21 @@ class TllmException : public std::runtime_error
     throw TllmException(file, line, fmtstr("[TensorRT-LLM][ERROR] Assertion failed: %s", info.c_str()).c_str());
 }
 
+class RequestSpecificException : public std::runtime_error // runtime_error tagged with a request ID and error code
+{
+public:
+    explicit RequestSpecificException( // file/line: source location; msg: description; requestID/errorCode: stored
+        std::string const& file, std::size_t line, char const* msg, uint64_t requestID, RequestErrorCode errorCode);
+    /// Destructor is only declared here; its definition lives outside the class body.
+    ~RequestSpecificException() noexcept override;
+    /// @return The request ID supplied at construction.
+    [[nodiscard]] uint64_t getRequestId() const noexcept;
+    /// @return The error code supplied at construction.
+    [[nodiscard]] RequestErrorCode getErrorCode() const noexcept;
+
+private:
+    uint64_t mRequestID;         ///< Request ID given to the constructor
+    RequestErrorCode mErrorCode; ///< Error code given to the constructor
+};
+
 } // namespace tensorrt_llm::common
@@ -427,8 +427,17 @@ void CacheTransceiver::checkContextTransferStatus(std::optional<int> const& atLe
         auto& [request, future] = *it;
         if (blockAll || (toCompleteIdSet.find(request->mRequestId) != toCompleteIdSet.end()))
         {
-            future.get();
-            request->setState(LlmRequestState::kDISAGG_CONTEXT_COMPLETE);
+            try
+            {
+                future.get();
+                request->setState(LlmRequestState::kDISAGG_CONTEXT_COMPLETE);
+            }
+            catch (std::exception const& e)
+            {
+                TLLM_LOG_ERROR(
+                    "Error occurred during context transfer for request %ld: %s", request->mRequestId, e.what());
+                request->setState(LlmRequestState::kDISAGG_TRANS_ERROR);
+            }
             it = mResponderFutures.erase(it);
         }
         else
@@ -521,19 +530,28 @@ void CacheTransceiver::checkGenTransferStatus(std::optional<int> const& atLeastR
     {
         if (blockAll || toCompleteIdSet.find(it->first->mRequestId) != toCompleteIdSet.end())
         {
-            it->second.get();
-
-            // Gather the kv cache transfer time from all workers and update to leader rank
-            if (!common::getEnvKVCacheTransferOutputPath().empty())
+            try
+            {
+                it->second.get();
+                it->first->setState(LlmRequestState::kDISAGG_GENERATION_TRANS_COMPLETE);
+
+                // Gather the kv cache transfer time from all workers and update to leader rank
+                if (!common::getEnvKVCacheTransferOutputPath().empty())
+                {
+                    auto syncComm
+                        = mCacheState->getParallelConfig().mEnableAttentionDP ? mMpiGroupDataComm.get() : mMpiGroupComm;
+                    updateKVCacheTransferBW(*syncComm, it->first);
+                }
+            }
+            catch (std::exception const& e)
             {
-                auto syncComm
-                    = mCacheState->getParallelConfig().mEnableAttentionDP ? mMpiGroupDataComm.get() : mMpiGroupComm;
-                updateKVCacheTransferBW(*syncComm, it->first);
+                TLLM_LOG_ERROR(
+                    "Error occurred during generation transfer for request %ld: %s", it->first->mRequestId, e.what());
+                it->first->setState(LlmRequestState::kDISAGG_TRANS_ERROR);
             }
             TLLM_LOG_DEBUG(mpi::MpiComm::world().getRank(),
                 "**** it->first->mRequestId: %ld, context request ID: %ld ******** get feature ***",
                 it->first->mRequestId, it->first->getContextPhaseParams().value().getReqId());
-            it->first->setState(LlmRequestState::kDISAGG_GENERATION_TRANS_COMPLETE);
             it = mRequesterFutures.erase(it);
         }
         else
 
@@ -22,6 +22,7 @@
 #include "tensorrt_llm/batch_manager/runtimeBuffers.h"
 #include "tensorrt_llm/common/envUtils.h"
 #include "tensorrt_llm/common/logger.h"
+#include "tensorrt_llm/common/tllmException.h"
 #include "tensorrt_llm/common/utils.h"
 #include "tensorrt_llm/runtime/utils/mpiUtils.h"
 #include <future>
@@ -190,6 +191,12 @@ class DataResponder::Impl
             mSender->release(id);
             resp.mPromise.set_value();
         }
+        catch (tensorrt_llm::common::RequestSpecificException const& e)
+        {
+            TLLM_LOG_ERROR("Exception in sendAndRemoveResponse: %s ", e.what());
+            auto new_exception = TLLM_REQUEST_EXCEPTION(id, e.getErrorCode(), "%s", e.what());
+            resp.mPromise.set_exception(std::make_exception_ptr(new_exception));
+        }
         catch (std::exception const& e)
         {
             TLLM_LOG_ERROR("Exception in sendAndRemoveResponse: %s ", e.what());
@@ -496,6 +503,15 @@ class DataRequester::Impl
                     requestSync(*requestAndPromise.mRequest);
                     requestAndPromise.mPromise->set_value();
                 }
+                catch (tensorrt_llm::common::RequestSpecificException const& err)
+                {
+                    TLLM_LOG_ERROR("Exception in DataRequester request(): request id:%zu , request context id:%zu : %s",
+                        requestAndPromise.mRequest->mRequestId,
+                        requestAndPromise.mRequest->getContextPhaseParams().value().getReqId(), err.what());
+                    auto new_exception = TLLM_REQUEST_EXCEPTION(
+                        requestAndPromise.mRequest->mRequestId, err.getErrorCode(), "%s", err.what());
+                    requestAndPromise.mPromise->set_exception(std::make_exception_ptr(new_exception));
+                }
                 catch (std::exception const& err)
                 {
                     TLLM_LOG_ERROR("Exception in DataRequester request(): request id:%ld , request context id:%ld : %s",
 
@@ -151,12 +151,28 @@ class TransferSession
 
     void send(size_t idx, void const* data, size_t size)
     {
-        mConnections.at(idx)->send(mDataContext, data, size);
+        try // wrap the raw connection call so a failure can be attributed to this session's request
+        {
+            mConnections.at(idx)->send(mDataContext, data, size);
+        }
+        catch (std::exception const& e) // NOTE(review): also catches std::out_of_range from at(idx)
+        {
+            throw common::RequestSpecificException( // re-thrown with mRequest's ID, always as kNETWORK_ERROR
+                __FILE__, __LINE__, e.what(), mRequest->mRequestId, common::RequestErrorCode::kNETWORK_ERROR);
+        }
     }
 
     void recv(size_t idx, void* data, size_t size)
     {
-        mConnections.at(idx)->recv(mDataContext, data, size);
+        try // wrap the raw connection call so a failure can be attributed to this session's request
+        {
+            mConnections.at(idx)->recv(mDataContext, data, size);
+        }
+        catch (std::exception const& e) // NOTE(review): also catches std::out_of_range from at(idx)
+        {
+            throw common::RequestSpecificException( // re-thrown with mRequest's ID, always as kNETWORK_ERROR
+                __FILE__, __LINE__, e.what(), mRequest->mRequestId, common::RequestErrorCode::kNETWORK_ERROR);
+        }
     }
 
     [[nodiscard]] LlmRequest const& getLlmRequest() const
 
@@ -17,6 +17,7 @@
 #include "tensorrt_llm/common/tllmException.h"
 #include "tensorrt_llm/common/stringUtils.h"
 
+#include <cinttypes>
 #include <cstdlib>
 #if !defined(_MSC_VER)
 #include <cxxabi.h>
@@ -106,4 +107,25 @@ std::string TllmException::demangle(char const* name)
 #endif
 }
 
+RequestSpecificException::RequestSpecificException( // what(): "<msg> (Request ID: N, Error Code: N) (<file>:<line>)"
+    std::string const& file, std::size_t line, char const* msg, uint64_t requestID, RequestErrorCode errorCode)
+    : std::runtime_error{fmtstr("%s (Request ID: %" PRIu64 ", Error Code: %u) (%s:%zu)", msg, requestID,
+        static_cast<uint32_t>(errorCode), file.c_str(), line)} // enum is printed as its underlying integer
+    , mRequestID{requestID} // returned by getRequestId()
+    , mErrorCode{errorCode} // returned by getErrorCode()
+{
+}
+// Destructor is defaulted here, outside the class body.
+RequestSpecificException::~RequestSpecificException() noexcept = default;
+
+uint64_t RequestSpecificException::getRequestId() const noexcept
+{
+    return mRequestID; // the request ID passed to the constructor
+}
+
+RequestErrorCode RequestSpecificException::getErrorCode() const noexcept
+{
+    return mErrorCode; // the error code passed to the constructor
+}
+
 } // namespace tensorrt_llm::common
@@ -20,11 +20,16 @@
 
 #include "tensorrt_llm/batch_manager/dataTransceiverImpl.h"
 #include "tensorrt_llm/common/cudaUtils.h"
+#include "tensorrt_llm/common/tllmException.h"
 #include "tensorrt_llm/executor/cache_transmission/ucx_utils/connection.h"
 
 namespace tensorrt_llm::executor::kv_cache
 {
 
+// Type aliases so the exception type and its error-code enum can be named without the common:: qualifier
+using RequestSpecificException = tensorrt_llm::common::RequestSpecificException;
+using RequestErrorCode = tensorrt_llm::common::RequestErrorCode;
+
 UcxConnection::UcxConnection(ConnectionIdType connectionId, std::shared_ptr<ucxx::Endpoint> endpoint,
     UcxConnectionManager* manager, bool fromRequester)
     : mConnectionId(connectionId)
 
@@ -10,6 +10,7 @@ set(SRCS
     batch_manager/kvCacheConnector.cpp
     batch_manager/kvCacheManager.cpp
     batch_manager/llmRequest.cpp
+    common/tllmExceptions.cpp
     executor/bindings.cpp
     executor/executor.cpp
     executor/executorConfig.cpp
 
@@ -37,6 +37,7 @@
 #include "tensorrt_llm/nanobind/batch_manager/kvCacheConnector.h"
 #include "tensorrt_llm/nanobind/batch_manager/kvCacheManager.h"
 #include "tensorrt_llm/nanobind/batch_manager/llmRequest.h"
+#include "tensorrt_llm/nanobind/common/tllmExceptions.h"
 #include "tensorrt_llm/nanobind/executor/bindings.h"
 #include "tensorrt_llm/nanobind/runtime/bindings.h"
 #include "tensorrt_llm/nanobind/testing/modelSpecBinding.h"
@@ -127,9 +128,11 @@ NB_MODULE(TRTLLM_NB_MODULE, m)
     auto mInternalTesting = mInternal.def_submodule("testing", "Testing internal bindings");
     auto mInternalBatchManager = mInternal.def_submodule("batch_manager", "Batch manager internal bindings");
     auto mInternalThop = mInternal.def_submodule("thop", "Torch op internal bindings");
+    auto mExceptions = m.def_submodule("exceptions", "Exceptions internal bindings");
 
     tensorrt_llm::nanobind::executor::initBindings(mExecutor);
     tensorrt_llm::nanobind::runtime::initBindingsEarly(mInternalRuntime);
+    tensorrt_llm::nanobind::common::initExceptionsBindings(mExceptions);
     tensorrt_llm::nanobind::thop::initBindings(mInternalThop);
 
     auto buildInfo = m.def_submodule("BuildInfo");
@@ -471,7 +474,8 @@ NB_MODULE(TRTLLM_NB_MODULE, m)
         .value("DISAGG_CONTEXT_COMPLETE", tb::LlmRequestState::kDISAGG_CONTEXT_COMPLETE)
         .value("DISAGG_GENERATION_TRANS_IN_PROGRESS", tb::LlmRequestState::kDISAGG_GENERATION_TRANS_IN_PROGRESS)
         .value("DISAGG_GENERATION_TRANS_COMPLETE", tb::LlmRequestState::kDISAGG_GENERATION_TRANS_COMPLETE)
-        .value("DISAGG_CONTEXT_INIT_AND_TRANS", tb::LlmRequestState::kDISAGG_CONTEXT_INIT_AND_TRANS);
+        .value("DISAGG_CONTEXT_INIT_AND_TRANS", tb::LlmRequestState::kDISAGG_CONTEXT_INIT_AND_TRANS)
+        .value("DISAGG_TRANS_ERROR", tb::LlmRequestState::kDISAGG_TRANS_ERROR);
 
     nb::class_<tr::MemoryCounters>(m, "MemoryCounters")
         .def_static("instance", &tr::MemoryCounters::getInstance, nb::rv_policy::reference)
 
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "tllmExceptions.h"
+#include "tensorrt_llm/common/tllmException.h"
+#include <nanobind/nanobind.h>
+
+namespace tc = tensorrt_llm::common;
+namespace nb = nanobind;
+
+namespace tensorrt_llm::nanobind::common
+{
+
+void initExceptionsBindings(nb::module_& m) // adds RequestErrorCode and RequestSpecificException to module m
+{
+    // Bind the RequestErrorCode enum; export_values() also exposes the enumerators directly in m's scope
+    nb::enum_<tc::RequestErrorCode>(m, "RequestErrorCode")
+        .value("UNKNOWN_ERROR", tc::RequestErrorCode::kUNKNOWN_ERROR)
+        .value("NETWORK_ERROR", tc::RequestErrorCode::kNETWORK_ERROR)
+        .export_values();
+
+    // Create the Python exception class; function-local static so the capture-less lambda below can refer to it
+    static nb::object request_specific_exc = nb::exception<tc::RequestSpecificException>(m, "RequestSpecificException");
+
+    // Class-level defaults (None) for the attributes that the translator below sets per exception
+    request_specific_exc.attr("request_id") = nb::none();
+    request_specific_exc.attr("error_code") = nb::none();
+
+    // Register a translator that converts tc::RequestSpecificException into the Python class created above
+    nb::register_exception_translator(
+        [](std::exception_ptr const& p, void*)
+        {
+            try
+            {
+                if (p)
+                    std::rethrow_exception(p);
+            }
+            catch (const tc::RequestSpecificException& e) // any other exception type propagates out of this lambda
+            {
+                // NOTE(review): nb::cast(e) assumes this exception type is castable to a Python object — confirm
+                nb::object py_exc = nb::cast(e);
+                nb::object request_id = nb::cast(e.getRequestId());
+                nb::object error_code = nb::cast(static_cast<uint32_t>(e.getErrorCode())); // plain integer, not the enum
+
+                // Attach the request ID and error code to this exception object
+                py_exc.attr("request_id") = request_id;
+                py_exc.attr("error_code") = error_code;
+
+                PyErr_SetObject(request_specific_exc.ptr(), py_exc.ptr()); // raise py_exc as request_specific_exc
+            }
+        });
+}
+
+} // namespace tensorrt_llm::nanobind::common