-
Notifications
You must be signed in to change notification settings - Fork 1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[bug] nccl allgather hung #2560
Comments
Do you have places inside the Executor that call NCCL?
I inserted a sync after the NCCL allgather:
void ncclStreamSynchronize(cudaStream_t stream, ncclComm_t comm) {
cudaError_t cudaErr;
ncclResult_t ncclErr, ncclAsyncErr;
while (1) {
cudaErr = cudaStreamQuery(stream);
if (cudaErr == cudaSuccess)
return;
if (cudaErr != cudaErrorNotReady) {
TLLM_CUDA_CHECK(cudaErr);
}
ncclErr = ncclCommGetAsyncError(comm, &ncclAsyncErr);
if (ncclErr != ncclSuccess) {
NCCLCHECK(ncclErr);
}
if (ncclAsyncErr != ncclSuccess) {
NCCLCHECK(ncclAsyncErr);
}
// We might want to let other threads (including NCCL threads) use the CPU.
std::this_thread::yield();
}
} TLLM_CHECK_WITH_INFO(mNcclComm.get() != nullptr, "mNcclComm should be initialized before used");
NCCLCHECK(ncclAllGather(inputs[0], outputs[0], size, (*getDtypeMap())[inputDesc[0].type], *mNcclComm, stream));
+ ncclStreamSynchronize(stream, *mNcclComm);
It freezes again after about 3 million (3kk) requests. Backtraces:
rank0
rank1
rank2
rank3
TensorRT-LLM version: https://github.com/NVIDIA/TensorRT-LLM/tree/340a1b62fc7a4124d6efef10df695b263734ab2f
--- a/cpp/tensorrt_llm/plugins/ncclPlugin/allgatherPlugin.cpp
+++ b/cpp/tensorrt_llm/plugins/ncclPlugin/allgatherPlugin.cpp
@@ -23,6 +23,34 @@ using namespace nvinfer1;
using tensorrt_llm::plugins::AllgatherPluginCreator;
using tensorrt_llm::plugins::AllgatherPlugin;
+namespace { // anonymous namespace: the helper below is local to this translation unit
+ void ncclStreamSynchronize(cudaStream_t stream, ncclComm_t comm) { // Polls `stream` until its queued work completes, checking `comm` for asynchronous NCCL errors on every iteration.
+ cudaError_t cudaErr;
+ ncclResult_t ncclErr, ncclAsyncErr;
+ while (1) { // no timeout: leaves only via the `return` below or if a check macro does not return (presumably throws/aborts — confirm)
+ cudaErr = cudaStreamQuery(stream); // non-blocking query of the stream's completion status
+ if (cudaErr == cudaSuccess) // all work previously queued on the stream has finished
+ return;
+
+ if (cudaErr != cudaErrorNotReady) { // anything other than "still running" is treated as a CUDA error
+ TLLM_CUDA_CHECK(cudaErr); // NOTE(review): macro is defined elsewhere; presumably reports the error and does not return — confirm
+ }
+
+ ncclErr = ncclCommGetAsyncError(comm, &ncclAsyncErr); // fetch the communicator's asynchronous error state into ncclAsyncErr
+ if (ncclErr != ncclSuccess) { // the query call itself failed
+ NCCLCHECK(ncclErr);
+ }
+
+ if (ncclAsyncErr != ncclSuccess) { // an asynchronous error was reported for the communicator
+ NCCLCHECK(ncclAsyncErr);
+ }
+
+ // Stream still pending and no error seen: yield so other threads (including NCCL threads) can use the CPU before polling again.
+ std::this_thread::yield();
+ }
+ }
+}
+
static char const* ALLGATHER_PLUGIN_VERSION{"1"};
static char const* ALLGATHER_PLUGIN_NAME{"AllGather"};
PluginFieldCollection AllgatherPluginCreator::mFC{};
@@ -101,8 +129,14 @@ int AllgatherPlugin::enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfe
size *= inputDesc[0].dims.d[i];
}
+ TLLM_LOG_WARNING("rank: %d start enqueue", COMM_SESSION.getRank());
+ ncclStreamSynchronize(stream, *mNcclComm);
+ TLLM_LOG_WARNING("rank: %d start ncclAllGather, size: %zu, dtype: %d", COMM_SESSION.getRank(), size, static_cast<int>(inputDesc[0].type));
TLLM_CHECK_WITH_INFO(mNcclComm.get() != nullptr, "mNcclComm should be initialized before used");
NCCLCHECK(ncclAllGather(inputs[0], outputs[0], size, (*getDtypeMap())[inputDesc[0].type], *mNcclComm, stream));
+ TLLM_LOG_WARNING("rank: %d end ncclAllGather", COMM_SESSION.getRank());
+ ncclStreamSynchronize(stream, *mNcclComm);
+ TLLM_LOG_WARNING("rank: %d end enqueue", COMM_SESSION.getRank());
return 0;
}
Hang logs:
cuda-gdb, rank0:
(cuda-gdb) info cuda kernels
Kernel Parent Dev Grid Status SMs Mask GridDim BlockDim Invocation
* 0 - 0 4347595797 Active 0x000000000000000000000000000fff0fff (24,1,1) (544,1,1) ncclDevKernel_AllGather_RING_LL()
(cuda-gdb) bt
#0 0x00007fce839d3b40 in ncclDevFunc_AllGather_RING_SIMPLE() ()
#1 0x00007fc563fcd000 in ncclDevKernel_AllGather_RING_LL(ncclDevKernelArgsStorage<4096ul>)<<<(24,1,1),(544,1,1)>>> ()
(cuda-gdb) thread apply all bt
Thread 20 (Thread 0x7fc4affff000 (LWP 3318470) "executor_server"):
#0 0x00007fd0b01b8307 in epoll_wait () from /lib64/libc.so.6
#1 0x00007fc4f8578984 in ?? () from /lib64/libcudadebugger.so.1
#2 0x00007fc4f8579462 in ?? () from /lib64/libcudadebugger.so.1
#3 0x00007fc4f8577610 in ?? () from /lib64/libcudadebugger.so.1
#4 0x00007fc4f8580aba in ?? () from /lib64/libcudadebugger.so.1
#5 0x00007fc4f8580fbc in ?? () from /lib64/libcudadebugger.so.1
#6 0x00007fc4f8466020 in ?? () from /lib64/libcudadebugger.so.1
#7 0x00007fc4f85c0997 in ?? () from /lib64/libcudadebugger.so.1
#8 0x00007fd0b0d861ca in start_thread () from /lib64/libpthread.so.0
#9 0x00007fd0b00b28d3 in clone () from /lib64/libc.so.6
Thread 18 (Thread 0x7fc539fff000 (LWP 3913243) "executor_server"):
#0 0x00007fd0b0d8c48c in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
#1 0x00007fd0b0aa5870 in std::condition_variable::wait(std::unique_lock<std::mutex>&) () from /lib64/libstdc++.so.6
#2 0x00000000004370b6 in writerWorker(modules::util::FileDescriptor, std::shared_ptr<modules::util::MTQueue<modules::executor_server::Response> >) ()
#3 0x0000000000440836 in std::thread::_State_impl<std::thread::_Invoker<std::tuple<void (*)(modules::util::FileDescriptor, std::shared_ptr<modules::util::MTQueue<modules::executor_server::Response> >), modules::util::FileDescriptor,
std::shared_ptr<modules::util::MTQueue<modules::executor_server::Response> > > > >::_M_run() ()
#4 0x00007fd0fd6ef930 in execute_native_thread_routine () from
/home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so
#5 0x00007fd0b0d861ca in start_thread () from /lib64/libpthread.so.0
#6 0x00007fd0b00b28d3 in clone () from /lib64/libc.so.6
Thread 17 (Thread 0x7fc5408fc000 (LWP 3913242) "executor_server"):
#0 0x00007fd0b0d8c7da in pthread_cond_timedwait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
#1 0x00007fd113e0fbff in tensorrt_llm::executor::Executor::Impl::awaitResponses(std::optional<std::chrono::duration<long, std::ratio<1l, 1000l> > > const&) () from
/home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/libtensorrt_llm.so
#2 0x00007fd113dfffdd in tensorrt_llm::executor::Executor::awaitResponses(std::optional<std::chrono::duration<long, std::ratio<1l, 1000l> > > const&) () from
/home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/libtensorrt_llm.so
#3 0x0000000000466f18 in modules::executor_server::Executor::ExecutorImpl::awaitRoutine(std::stop_token) ()
#4 0x0000000000468100 in std::thread::_State_impl<std::thread::_Invoker<std::tuple<modules::executor_server::Executor::ExecutorImpl::ExecutorImpl(modules::executor_server::MainRank, std::filesystem::__cxx11::path const&,
modules::executor_server::ExecutorConfig const&, modules::executor_server::LogitsProcessorStaticConfig const&)::{lambda(std::stop_token)#1}, std::stop_token> > >::_M_run() ()
#5 0x00007fd0fd6ef930 in execute_native_thread_routine () from
/home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so
#6 0x00007fd0b0d861ca in start_thread () from /lib64/libpthread.so.0
#7 0x00007fd0b00b28d3 in clone () from /lib64/libc.so.6
Thread 16 (Thread 0x7fc5410fd000 (LWP 3913240) "executionLoop"):
#0 0x00007fd0b00b241d in syscall () from /lib64/libc.so.6
#1 0x00007fc4f85bf730 in ?? () from /lib64/libcudadebugger.so.1
#2 0x00007fc4f81585ec in ?? () from /lib64/libcudadebugger.so.1
#3 0x00007fc4f8274bd6 in ?? () from /lib64/libcudadebugger.so.1
#4 0x00007fd0b28ee21f in ?? () from /lib64/libcuda.so.1
#5 0x00007fd12f2b0135 in libcudart_static_7304beb15f17907bccdc6b7603b23d2a2951bc68 () from /home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/plugins/libnvinfer_plugin_tensorrt_llm.so
#6 0x00007fd12f313acd in cudaStreamQuery () from /home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/plugins/libnvinfer_plugin_tensorrt_llm.so
#7 0x00007fd12f28752b in (anonymous namespace)::ncclStreamSynchronize(CUstream_st*, ncclComm*) () from /home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/plugins/libnvinfer_plugin_tensorrt_llm.so
#8 0x00007fd12f28853c in tensorrt_llm::plugins::AllgatherPlugin::enqueue(nvinfer1::PluginTensorDesc const*, nvinfer1::PluginTensorDesc const*, void const* const*, void* const*, void*, CUstream_st*) () from
/home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/plugins/libnvinfer_plugin_tensorrt_llm.so
#9 0x00007fd0c4a83fec in ?? () from /usr/local/tensorrt/targets/x86_64-linux-gnu/lib/libnvinfer.so.10
#10 0x00007fd0c49f4a55 in ?? () from /usr/local/tensorrt/targets/x86_64-linux-gnu/lib/libnvinfer.so.10
#11 0x00007fd0c49f6609 in ?? () from /usr/local/tensorrt/targets/x86_64-linux-gnu/lib/libnvinfer.so.10
#12 0x00007fd1139863a4 in tensorrt_llm::runtime::TllmRuntime::executeContext(int) const () from /home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/libtensorrt_llm.so
#13 0x00007fd113d7208b in tensorrt_llm::batch_manager::TrtGptModelInflightBatching::executeContext(int, int) () from /home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/libtensorrt_llm.so
#14 0x00007fd113d810b6 in tensorrt_llm::batch_manager::TrtGptModelInflightBatching::executeStep(std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> >
> const&, std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, int) () from
/home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/libtensorrt_llm.so
#15 0x00007fd113d818de in tensorrt_llm::batch_manager::TrtGptModelInflightBatching::executeBatch(tensorrt_llm::batch_manager::ScheduledRequests const&) () from
/home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/libtensorrt_llm.so
#16 0x00007fd113d8201b in tensorrt_llm::batch_manager::TrtGptModelInflightBatching::forwardAsync(std::__cxx11::list<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>,
std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&) () from /home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/libtensorrt_llm.so
#17 0x00007fd113e174a1 in tensorrt_llm::executor::Executor::Impl::forwardAsync(std::__cxx11::list<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > >&) ()
from /home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/libtensorrt_llm.so
#18 0x00007fd113e1e5bc in tensorrt_llm::executor::Executor::Impl::executionLoop() () from /home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/libtensorrt_llm.so
#19 0x00007fd0fd6ef930 in execute_native_thread_routine () from
/home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so
--Type <RET> for more, q to quit, c to continue without paging--
#20 0x00007fd0b0d861ca in start_thread () from /lib64/libpthread.so.0
#21 0x00007fd0b00b28d3 in clone () from /lib64/libc.so.6
Thread 15 (Thread 0x7fc5418fe000 (LWP 3913237) "dataTransResp"):
#0 0x00007fd0b0d8c48c in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
#1 0x00007fd0b0aa5870 in std::condition_variable::wait(std::unique_lock<std::mutex>&) () from /lib64/libstdc++.so.6
#2 0x00007fd113cfba6b in tensorrt_llm::batch_manager::DataResponder::Impl::response() () from /home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/libtensorrt_llm.so
#3 0x00007fd113cf9c3d in std::_Function_handler<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> (), std::__future_base::_Task_setter<std::unique_ptr<std::__future_base::_Result<void>,
std::__future_base::_Result_base::_Deleter>, std::thread::_Invoker<std::tuple<void (tensorrt_llm::batch_manager::DataResponder::Impl::*)(), tensorrt_llm::batch_manager::DataResponder::Impl*> >, void> >::_M_invoke(std::_Any_data const&) ()
from /home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/libtensorrt_llm.so
#4 0x000000000043821b in std::__future_base::_State_baseV2::_M_do_set(std::function<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()>*, bool*) ()
#5 0x00007fd0b0d8de97 in __pthread_once_slow () from /lib64/libpthread.so.0
#6 0x00007fd113cfaa8d in std::thread::_State_impl<std::thread::_Invoker<std::tuple<std::__future_base::_Async_state_impl<std::thread::_Invoker<std::tuple<void (tensorrt_llm::batch_manager::DataResponder::Impl::*)(),
tensorrt_llm::batch_manager::DataResponder::Impl*> >, void>::_Async_state_impl(std::thread::_Invoker<std::tuple<void (tensorrt_llm::batch_manager::DataResponder::Impl::*)(), tensorrt_llm::batch_manager::DataResponder::Impl*>
>&&)::{lambda()#1}> > >::_M_run() () from /home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/libtensorrt_llm.so
#7 0x00007fd0fd6ef930 in execute_native_thread_routine () from
/home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so
#8 0x00007fd0b0d861ca in start_thread () from /lib64/libpthread.so.0
#9 0x00007fd0b00b28d3 in clone () from /lib64/libc.so.6
Thread 14 (Thread 0x7fc564ba8000 (LWP 3913229) "executor_server"):
#0 0x00007fd0b0d8c48c in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
#1 0x00007fd0b4847727 in ncclProxyProgress(void*) () from /lib64/libnccl.so.2
#2 0x00007fd0b0d861ca in start_thread () from /lib64/libpthread.so.0
#3 0x00007fd0b00b28d3 in clone () from /lib64/libc.so.6
Thread 13 (Thread 0x7fc5653a9000 (LWP 3913226) "executor_server"):
#0 0x00007fd0b01abac1 in poll () from /lib64/libc.so.6
#1 0x00007fd0b484647a in ncclProxyServiceUDS(void*) () from /lib64/libnccl.so.2
#2 0x00007fd0b0d861ca in start_thread () from /lib64/libpthread.so.0
#3 0x00007fd0b00b28d3 in clone () from /lib64/libc.so.6
Thread 12 (Thread 0x7fce88e67000 (LWP 3913225) "executor_server"):
#0 0x00007fd0b01abac1 in poll () from /lib64/libc.so.6
#1 0x00007fd0b4847dc2 in ncclProxyService(void*) () from /lib64/libnccl.so.2
#2 0x00007fd0b0d861ca in start_thread () from /lib64/libpthread.so.0
#3 0x00007fd0b00b28d3 in clone () from /lib64/libc.so.6
Thread 11 (Thread 0x7fc574ffd000 (LWP 3913202) "executor_server"):
#0 0x00007fd0b0d8fae4 in read () from /lib64/libpthread.so.0
#1 0x00007fd09e885fa3 in ibv_get_async_event () from /lib64/libibverbs.so.1
#2 0x00007fd0b48780d2 in wrap_ibv_get_async_event(ibv_context*, ibv_async_event*) () from /lib64/libnccl.so.2
#3 0x00007fd0b489ac04 in ncclIbAsyncThreadMain(void*) () from /lib64/libnccl.so.2
#4 0x00007fd0b0d861ca in start_thread () from /lib64/libpthread.so.0
#5 0x00007fd0b00b28d3 in clone () from /lib64/libc.so.6
Thread 10 (Thread 0x7fc5757fe000 (LWP 3913199) "executor_server"):
#0 0x00007fd0b0d8fae4 in read () from /lib64/libpthread.so.0
#1 0x00007fd09e885fa3 in ibv_get_async_event () from /lib64/libibverbs.so.1
#2 0x00007fd0b48780d2 in wrap_ibv_get_async_event(ibv_context*, ibv_async_event*) () from /lib64/libnccl.so.2
#3 0x00007fd0b489ac04 in ncclIbAsyncThreadMain(void*) () from /lib64/libnccl.so.2
#4 0x00007fd0b0d861ca in start_thread () from /lib64/libpthread.so.0
#5 0x00007fd0b00b28d3 in clone () from /lib64/libc.so.6
Thread 9 (Thread 0x7fc575fff000 (LWP 3913197) "executor_server"):
#0 0x00007fd0b0d8fae4 in read () from /lib64/libpthread.so.0
#1 0x00007fd09e885fa3 in ibv_get_async_event () from /lib64/libibverbs.so.1
#2 0x00007fd0b48780d2 in wrap_ibv_get_async_event(ibv_context*, ibv_async_event*) () from /lib64/libnccl.so.2
#3 0x00007fd0b489ac04 in ncclIbAsyncThreadMain(void*) () from /lib64/libnccl.so.2
#4 0x00007fd0b0d861ca in start_thread () from /lib64/libpthread.so.0
#5 0x00007fd0b00b28d3 in clone () from /lib64/libc.so.6
Thread 8 (Thread 0x7fc9e6e05000 (LWP 3913195) "executor_server"):
#0 0x00007fd0b0d8fae4 in read () from /lib64/libpthread.so.0
#1 0x00007fd09e885fa3 in ibv_get_async_event () from /lib64/libibverbs.so.1
#2 0x00007fd0b48780d2 in wrap_ibv_get_async_event(ibv_context*, ibv_async_event*) () from /lib64/libnccl.so.2
--Type <RET> for more, q to quit, c to continue without paging--
#3 0x00007fd0b489ac04 in ncclIbAsyncThreadMain(void*) () from /lib64/libnccl.so.2
#4 0x00007fd0b0d861ca in start_thread () from /lib64/libpthread.so.0
#5 0x00007fd0b00b28d3 in clone () from /lib64/libc.so.6
Thread 7 (Thread 0x7fc9e7606000 (LWP 3913189) "executor_server"):
#0 0x00007fd0b0d8fae4 in read () from /lib64/libpthread.so.0
#1 0x00007fd09e885fa3 in ibv_get_async_event () from /lib64/libibverbs.so.1
#2 0x00007fd0b48780d2 in wrap_ibv_get_async_event(ibv_context*, ibv_async_event*) () from /lib64/libnccl.so.2
#3 0x00007fd0b489ac04 in ncclIbAsyncThreadMain(void*) () from /lib64/libnccl.so.2
#4 0x00007fd0b0d861ca in start_thread () from /lib64/libpthread.so.0
#5 0x00007fd0b00b28d3 in clone () from /lib64/libc.so.6
Thread 6 (Thread 0x7fc9e7e07000 (LWP 3913187) "executor_server"):
#0 0x00007fd0b0d8fae4 in read () from /lib64/libpthread.so.0
#1 0x00007fd09e885fa3 in ibv_get_async_event () from /lib64/libibverbs.so.1
#2 0x00007fd0b48780d2 in wrap_ibv_get_async_event(ibv_context*, ibv_async_event*) () from /lib64/libnccl.so.2
#3 0x00007fd0b489ac04 in ncclIbAsyncThreadMain(void*) () from /lib64/libnccl.so.2
#4 0x00007fd0b0d861ca in start_thread () from /lib64/libpthread.so.0
#5 0x00007fd0b00b28d3 in clone () from /lib64/libc.so.6
Thread 5 (Thread 0x7fce8b55e000 (LWP 3913173) "cuda-EvtHandlr"):
#0 0x00007fd0b0d8c48c in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
#1 0x00007fc4f85794ba in ?? () from /lib64/libcudadebugger.so.1
#2 0x00007fc4f8577610 in ?? () from /lib64/libcudadebugger.so.1
#3 0x00007fc4f818e225 in ?? () from /lib64/libcudadebugger.so.1
#4 0x00007fc4f81e2961 in ?? () from /lib64/libcudadebugger.so.1
#5 0x00007fc4f8173a9f in ?? () from /lib64/libcudadebugger.so.1
#6 0x00007fc4f8274bd6 in ?? () from /lib64/libcudadebugger.so.1
#7 0x00007fd0b2a9e42b in ?? () from /lib64/libcuda.so.1
#8 0x00007fd0b293b184 in ?? () from /lib64/libcuda.so.1
#9 0x00007fd0b28640f3 in ?? () from /lib64/libcuda.so.1
#10 0x00007fd0b0d861ca in start_thread () from /lib64/libpthread.so.0
#11 0x00007fd0b00b28d3 in clone () from /lib64/libc.so.6
Thread 4 (Thread 0x7fd09a759000 (LWP 3913167) "cuda00006000019"):
#0 0x00007fd0b01abac1 in poll () from /lib64/libc.so.6
#1 0x00007fd0b2867e4f in ?? () from /lib64/libcuda.so.1
#2 0x00007fd0b293b12f in ?? () from /lib64/libcuda.so.1
#3 0x00007fd0b28640f3 in ?? () from /lib64/libcuda.so.1
#4 0x00007fd0b0d861ca in start_thread () from /lib64/libpthread.so.0
#5 0x00007fd0b00b28d3 in clone () from /lib64/libc.so.6
Thread 3 (Thread 0x7fd0a658a000 (LWP 3913159) "executor_server"):
#0 0x00007fd0b01b8307 in epoll_wait () from /lib64/libc.so.6
#1 0x00007fd0aebcf509 in ?? () from /lib64/libevent_core-2.1.so.6
#2 0x00007fd0aebc51e8 in event_base_loop () from /lib64/libevent_core-2.1.so.6
#3 0x00007fd0ac7f9276 in ?? () from /lib64/libpmix.so.2
#4 0x00007fd0b0d861ca in start_thread () from /lib64/libpthread.so.0
#5 0x00007fd0b00b28d3 in clone () from /lib64/libc.so.6
Thread 2 (Thread 0x7fd0ad491000 (LWP 3913155) "executor_server"):
#0 0x00007fd0b01abac1 in poll () from /lib64/libc.so.6
#1 0x00007fd0aebce8e9 in ?? () from /lib64/libevent_core-2.1.so.6
#2 0x00007fd0aebc51e8 in event_base_loop () from /lib64/libevent_core-2.1.so.6
#3 0x00007fd0af265d96 in ?? () from /usr/lib64/openmpi/lib/libopen-pal.so.40
#4 0x00007fd0b0d861ca in start_thread () from /lib64/libpthread.so.0
#5 0x00007fd0b00b28d3 in clone () from /lib64/libc.so.6
Thread 1 (Thread 0x7fd134703000 (LWP 3913149) "executor_server"):
#0 0x00007fd0b0d8fae4 in read () from /lib64/libpthread.so.0
#1 0x0000000000543b1a in google::protobuf::io::FileInputStream::CopyingFileInputStream::Read(void*, int) ()
#2 0x0000000000548812 in google::protobuf::io::CopyingInputStreamAdaptor::Next(void const**, int*) ()
#3 0x00000000005418a7 in google::protobuf::io::CodedInputStream::Refresh() [clone .part.0] ()
#4 0x000000000043674e in readerWorker(modules::util::FileDescriptor, modules::executor_server::Executor&, std::shared_ptr<modules::util::LoraLoader>, unsigned long, std::function<void (modules::executor_server::Response&&)> const&) ()
#5 0x000000000042d3bb in main ()
cuda-gdb, rank1:
(cuda-gdb) info cuda kernels
No CUDA kernels.
(cuda-gdb) thread apply all bt
Thread 19 (Thread 0x7f75a7fff000 (LWP 3273909) "executor_server"):
#0 0x00007f81cb05c307 in epoll_wait () from /lib64/libc.so.6
#1 0x00007f75c2578984 in ?? () from /lib64/libcudadebugger.so.1
#2 0x00007f75c2579462 in ?? () from /lib64/libcudadebugger.so.1
#3 0x00007f75c2577610 in ?? () from /lib64/libcudadebugger.so.1
#4 0x00007f75c2580aba in ?? () from /lib64/libcudadebugger.so.1
#5 0x00007f75c2580fbc in ?? () from /lib64/libcudadebugger.so.1
#6 0x00007f75c2466020 in ?? () from /lib64/libcudadebugger.so.1
#7 0x00007f75c25c0997 in ?? () from /lib64/libcudadebugger.so.1
#8 0x00007f81cbc2a1ca in start_thread () from /lib64/libpthread.so.0
#9 0x00007f81caf568d3 in clone () from /lib64/libc.so.6
Thread 17 (Thread 0x7f76417fe000 (LWP 3913348) "cuda-EvtHandlr"):
#0 0x00007f81cbc3048c in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
#1 0x00007f75c25794ba in ?? () from /lib64/libcudadebugger.so.1
#2 0x00007f75c2577610 in ?? () from /lib64/libcudadebugger.so.1
#3 0x00007f75c218e225 in ?? () from /lib64/libcudadebugger.so.1
#4 0x00007f75c21e2961 in ?? () from /lib64/libcudadebugger.so.1
#5 0x00007f75c2173a9f in ?? () from /lib64/libcudadebugger.so.1
#6 0x00007f75c2274bd6 in ?? () from /lib64/libcudadebugger.so.1
#7 0x00007f81cd94242b in ?? () from /lib64/libcuda.so.1
#8 0x00007f81cd7df184 in ?? () from /lib64/libcuda.so.1
#9 0x00007f81cd7080f3 in ?? () from /lib64/libcuda.so.1
#10 0x00007f81cbc2a1ca in start_thread () from /lib64/libpthread.so.0
#11 0x00007f81caf568d3 in clone () from /lib64/libc.so.6
Thread 16 (Thread 0x7f7641fff000 (LWP 3913241) "executionLoop"):
#0 0x00007f81caf5641d in syscall () from /lib64/libc.so.6
#1 0x00007f75c25bf730 in ?? () from /lib64/libcudadebugger.so.1
#2 0x00007f75c21585ec in ?? () from /lib64/libcudadebugger.so.1
#3 0x00007f75c2274bd6 in ?? () from /lib64/libcudadebugger.so.1
#4 0x00007f81cd79221f in ?? () from /lib64/libcuda.so.1
#5 0x00007f824a154135 in libcudart_static_7304beb15f17907bccdc6b7603b23d2a2951bc68 () from /home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/plugins/libnvinfer_plugin_tensorrt_llm.so
#6 0x00007f824a1b7acd in cudaStreamQuery () from /home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/plugins/libnvinfer_plugin_tensorrt_llm.so
#7 0x00007f824a12b52b in (anonymous namespace)::ncclStreamSynchronize(CUstream_st*, ncclComm*) () from /home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/plugins/libnvinfer_plugin_tensorrt_llm.so
#8 0x00007f824a12c53c in tensorrt_llm::plugins::AllgatherPlugin::enqueue(nvinfer1::PluginTensorDesc const*, nvinfer1::PluginTensorDesc const*, void const* const*, void* const*, void*, CUstream_st*) () from
/home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/plugins/libnvinfer_plugin_tensorrt_llm.so
#9 0x00007f81df927fec in ?? () from /usr/local/tensorrt/targets/x86_64-linux-gnu/lib/libnvinfer.so.10
#10 0x00007f81df898a55 in ?? () from /usr/local/tensorrt/targets/x86_64-linux-gnu/lib/libnvinfer.so.10
#11 0x00007f81df89a609 in ?? () from /usr/local/tensorrt/targets/x86_64-linux-gnu/lib/libnvinfer.so.10
#12 0x00007f822e82a3a4 in tensorrt_llm::runtime::TllmRuntime::executeContext(int) const () from /home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/libtensorrt_llm.so
#13 0x00007f822ec1608b in tensorrt_llm::batch_manager::TrtGptModelInflightBatching::executeContext(int, int) () from /home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/libtensorrt_llm.so
#14 0x00007f822ec250b6 in tensorrt_llm::batch_manager::TrtGptModelInflightBatching::executeStep(std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> >
> const&, std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, int) () from
/home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/libtensorrt_llm.so
#15 0x00007f822ec258de in tensorrt_llm::batch_manager::TrtGptModelInflightBatching::executeBatch(tensorrt_llm::batch_manager::ScheduledRequests const&) () from
/home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/libtensorrt_llm.so
#16 0x00007f822ec2601b in tensorrt_llm::batch_manager::TrtGptModelInflightBatching::forwardAsync(std::__cxx11::list<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>,
std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&) () from /home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/libtensorrt_llm.so
#17 0x00007f822ecbb4a1 in tensorrt_llm::executor::Executor::Impl::forwardAsync(std::__cxx11::list<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > >&) ()
from /home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/libtensorrt_llm.so
#18 0x00007f822ecc25bc in tensorrt_llm::executor::Executor::Impl::executionLoop() () from /home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/libtensorrt_llm.so
#19 0x00007f8218593930 in execute_native_thread_routine () from
/home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so
#20 0x00007f81cbc2a1ca in start_thread () from /lib64/libpthread.so.0
#21 0x00007f81caf568d3 in clone () from /lib64/libc.so.6
Thread 15 (Thread 0x7f76609f8000 (LWP 3913235) "dataTransResp"):
#0 0x00007f81cbc3048c in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
#1 0x00007f81cb949870 in std::condition_variable::wait(std::unique_lock<std::mutex>&) () from /lib64/libstdc++.so.6
#2 0x00007f822eb9fa6b in tensorrt_llm::batch_manager::DataResponder::Impl::response() () from /home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/libtensorrt_llm.so
#3 0x00007f822eb9dc3d in std::_Function_handler<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> (), std::__future_base::_Task_setter<std::unique_ptr<std::__future_base::_Result<void>,
std::__future_base::_Result_base::_Deleter>, std::thread::_Invoker<std::tuple<void (tensorrt_llm::batch_manager::DataResponder::Impl::*)(), tensorrt_llm::batch_manager::DataResponder::Impl*> >, void> >::_M_invoke(std::_Any_data const&) ()
from /home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/libtensorrt_llm.so
#4 0x000000000043821b in std::__future_base::_State_baseV2::_M_do_set(std::function<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()>*, bool*) ()
--Type <RET> for more, q to quit, c to continue without paging--
#5 0x00007f81cbc31e97 in __pthread_once_slow () from /lib64/libpthread.so.0
#6 0x00007f822eb9ea8d in std::thread::_State_impl<std::thread::_Invoker<std::tuple<std::__future_base::_Async_state_impl<std::thread::_Invoker<std::tuple<void (tensorrt_llm::batch_manager::DataResponder::Impl::*)(),
tensorrt_llm::batch_manager::DataResponder::Impl*> >, void>::_Async_state_impl(std::thread::_Invoker<std::tuple<void (tensorrt_llm::batch_manager::DataResponder::Impl::*)(), tensorrt_llm::batch_manager::DataResponder::Impl*>
>&&)::{lambda()#1}> > >::_M_run() () from /home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/libtensorrt_llm.so
#7 0x00007f8218593930 in execute_native_thread_routine () from
/home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so
#8 0x00007f81cbc2a1ca in start_thread () from /lib64/libpthread.so.0
#9 0x00007f81caf568d3 in clone () from /lib64/libc.so.6
Thread 14 (Thread 0x7f76618fa000 (LWP 3913228) "executor_server"):
#0 0x00007f81cbc3048c in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
#1 0x00007f81cf6eb727 in ncclProxyProgress(void*) () from /lib64/libnccl.so.2
#2 0x00007f81cbc2a1ca in start_thread () from /lib64/libpthread.so.0
#3 0x00007f81caf568d3 in clone () from /lib64/libc.so.6
Thread 13 (Thread 0x7f76627fc000 (LWP 3913222) "executor_server"):
#0 0x00007f81cb04fac1 in poll () from /lib64/libc.so.6
#1 0x00007f81cf6ea47a in ncclProxyServiceUDS(void*) () from /lib64/libnccl.so.2
#2 0x00007f81cbc2a1ca in start_thread () from /lib64/libpthread.so.0
#3 0x00007f81caf568d3 in clone () from /lib64/libc.so.6
Thread 12 (Thread 0x7f7662ffd000 (LWP 3913220) "executor_server"):
#0 0x00007f81cb04fac1 in poll () from /lib64/libc.so.6
#1 0x00007f81cf6ebdc2 in ncclProxyService(void*) () from /lib64/libnccl.so.2
#2 0x00007f81cbc2a1ca in start_thread () from /lib64/libpthread.so.0
#3 0x00007f81caf568d3 in clone () from /lib64/libc.so.6
Thread 11 (Thread 0x7f76637fe000 (LWP 3913204) "executor_server"):
#0 0x00007f81cbc33ae4 in read () from /lib64/libpthread.so.0
#1 0x00007f81bd827fa3 in ibv_get_async_event () from /lib64/libibverbs.so.1
#2 0x00007f81cf71c0d2 in wrap_ibv_get_async_event(ibv_context*, ibv_async_event*) () from /lib64/libnccl.so.2
#3 0x00007f81cf73ec04 in ncclIbAsyncThreadMain(void*) () from /lib64/libnccl.so.2
#4 0x00007f81cbc2a1ca in start_thread () from /lib64/libpthread.so.0
#5 0x00007f81caf568d3 in clone () from /lib64/libc.so.6
Thread 10 (Thread 0x7f7663fff000 (LWP 3913200) "executor_server"):
#0 0x00007f81cbc33ae4 in read () from /lib64/libpthread.so.0
#1 0x00007f81bd827fa3 in ibv_get_async_event () from /lib64/libibverbs.so.1
#2 0x00007f81cf71c0d2 in wrap_ibv_get_async_event(ibv_context*, ibv_async_event*) () from /lib64/libnccl.so.2
#3 0x00007f81cf73ec04 in ncclIbAsyncThreadMain(void*) () from /lib64/libnccl.so.2
#4 0x00007f81cbc2a1ca in start_thread () from /lib64/libpthread.so.0
#5 0x00007f81caf568d3 in clone () from /lib64/libc.so.6
Thread 9 (Thread 0x7f7686ba8000 (LWP 3913193) "executor_server"):
#0 0x00007f81cbc33ae4 in read () from /lib64/libpthread.so.0
#1 0x00007f81bd827fa3 in ibv_get_async_event () from /lib64/libibverbs.so.1
#2 0x00007f81cf71c0d2 in wrap_ibv_get_async_event(ibv_context*, ibv_async_event*) () from /lib64/libnccl.so.2
#3 0x00007f81cf73ec04 in ncclIbAsyncThreadMain(void*) () from /lib64/libnccl.so.2
#4 0x00007f81cbc2a1ca in start_thread () from /lib64/libpthread.so.0
#5 0x00007f81caf568d3 in clone () from /lib64/libc.so.6
Thread 8 (Thread 0x7f76873a9000 (LWP 3913191) "executor_server"):
#0 0x00007f81cbc33ae4 in read () from /lib64/libpthread.so.0
#1 0x00007f81bd827fa3 in ibv_get_async_event () from /lib64/libibverbs.so.1
#2 0x00007f81cf71c0d2 in wrap_ibv_get_async_event(ibv_context*, ibv_async_event*) () from /lib64/libnccl.so.2
#3 0x00007f81cf73ec04 in ncclIbAsyncThreadMain(void*) () from /lib64/libnccl.so.2
#4 0x00007f81cbc2a1ca in start_thread () from /lib64/libpthread.so.0
#5 0x00007f81caf568d3 in clone () from /lib64/libc.so.6
Thread 7 (Thread 0x7f7b02d31000 (LWP 3913190) "executor_server"):
#0 0x00007f81cbc33ae4 in read () from /lib64/libpthread.so.0
#1 0x00007f81bd827fa3 in ibv_get_async_event () from /lib64/libibverbs.so.1
#2 0x00007f81cf71c0d2 in wrap_ibv_get_async_event(ibv_context*, ibv_async_event*) () from /lib64/libnccl.so.2
#3 0x00007f81cf73ec04 in ncclIbAsyncThreadMain(void*) () from /lib64/libnccl.so.2
#4 0x00007f81cbc2a1ca in start_thread () from /lib64/libpthread.so.0
#5 0x00007f81caf568d3 in clone () from /lib64/libc.so.6
--Type <RET> for more, q to quit, c to continue without paging--
Thread 6 (Thread 0x7f7b03532000 (LWP 3913188) "executor_server"):
#0 0x00007f81cbc33ae4 in read () from /lib64/libpthread.so.0
#1 0x00007f81bd827fa3 in ibv_get_async_event () from /lib64/libibverbs.so.1
#2 0x00007f81cf71c0d2 in wrap_ibv_get_async_event(ibv_context*, ibv_async_event*) () from /lib64/libnccl.so.2
#3 0x00007f81cf73ec04 in ncclIbAsyncThreadMain(void*) () from /lib64/libnccl.so.2
#4 0x00007f81cbc2a1ca in start_thread () from /lib64/libpthread.so.0
#5 0x00007f81caf568d3 in clone () from /lib64/libc.so.6
Thread 5 (Thread 0x7f7fa62ce000 (LWP 3913177) "cuda-EvtHandlr"):
#0 0x00007f81cb04fac1 in poll () from /lib64/libc.so.6
#1 0x00007f81cd70be4f in ?? () from /lib64/libcuda.so.1
#2 0x00007f81cd7df12f in ?? () from /lib64/libcuda.so.1
#3 0x00007f81cd7080f3 in ?? () from /lib64/libcuda.so.1
#4 0x00007f81cbc2a1ca in start_thread () from /lib64/libpthread.so.0
#5 0x00007f81caf568d3 in clone () from /lib64/libc.so.6
Thread 4 (Thread 0x7f81b555b000 (LWP 3913170) "cuda00006000019"):
#0 0x00007f81cb04fac1 in poll () from /lib64/libc.so.6
#1 0x00007f81cd70be4f in ?? () from /lib64/libcuda.so.1
#2 0x00007f81cd7df12f in ?? () from /lib64/libcuda.so.1
#3 0x00007f81cd7080f3 in ?? () from /lib64/libcuda.so.1
#4 0x00007f81cbc2a1ca in start_thread () from /lib64/libpthread.so.0
#5 0x00007f81caf568d3 in clone () from /lib64/libc.so.6
Thread 3 (Thread 0x7f81c5520000 (LWP 3913160) "executor_server"):
#0 0x00007f81cb05c307 in epoll_wait () from /lib64/libc.so.6
#1 0x00007f81c9a73509 in ?? () from /lib64/libevent_core-2.1.so.6
#2 0x00007f81c9a691e8 in event_base_loop () from /lib64/libevent_core-2.1.so.6
#3 0x00007f81c769d276 in ?? () from /lib64/libpmix.so.2
#4 0x00007f81cbc2a1ca in start_thread () from /lib64/libpthread.so.0
#5 0x00007f81caf568d3 in clone () from /lib64/libc.so.6
Thread 2 (Thread 0x7f81c8335000 (LWP 3913156) "executor_server"):
#0 0x00007f81cb04fac1 in poll () from /lib64/libc.so.6
#1 0x00007f81c9a728e9 in ?? () from /lib64/libevent_core-2.1.so.6
#2 0x00007f81c9a691e8 in event_base_loop () from /lib64/libevent_core-2.1.so.6
#3 0x00007f81ca109d96 in ?? () from /usr/lib64/openmpi/lib/libopen-pal.so.40
#4 0x00007f81cbc2a1ca in start_thread () from /lib64/libpthread.so.0
#5 0x00007f81caf568d3 in clone () from /lib64/libc.so.6
Thread 1 (Thread 0x7f824f5a7000 (LWP 3913150) "executor_server"):
#0 0x00007f81cbc2b6cd in __pthread_timedjoin_ex () from /lib64/libpthread.so.0
#1 0x00007f81cb94fda7 in std::thread::join() () from /lib64/libstdc++.so.6
#2 0x00007f822ecbf009 in tensorrt_llm::executor::Executor::Impl::shutdown() () from /home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/libtensorrt_llm.so
#3 0x00007f822ecbf0f6 in tensorrt_llm::executor::Executor::Impl::~Impl() () from /home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/libtensorrt_llm.so
#4 0x00007f822eca3f81 in tensorrt_llm::executor::Executor::~Executor() () from /home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/libtensorrt_llm.so
#5 0x000000000046247a in modules::executor_server::Executor::ExecutorImpl::~ExecutorImpl() ()
#6 0x000000000046e701 in modules::executor_server::Executor::~Executor() ()
#7 0x0000000000440381 in std::unique_ptr<modules::executor_server::Executor, std::default_delete<modules::executor_server::Executor> >::~unique_ptr() ()
#8 0x000000000042d4fc in main ()
cuda-gdb rank2
(cuda-gdb) info cuda kernels
No CUDA kernels.
(cuda-gdb) thread apply all bt
Thread 19 (Thread 0x7fb9a902b000 (LWP 3272138) "executor_server"):
#0 0x00007fc5b12a9307 in epoll_wait () from /lib64/libc.so.6
#1 0x00007fb9a955d984 in ?? () from /lib64/libcudadebugger.so.1
#2 0x00007fb9a955e462 in ?? () from /lib64/libcudadebugger.so.1
#3 0x00007fb9a955c610 in ?? () from /lib64/libcudadebugger.so.1
#4 0x00007fb9a9565aba in ?? () from /lib64/libcudadebugger.so.1
#5 0x00007fb9a9565fbc in ?? () from /lib64/libcudadebugger.so.1
#6 0x00007fb9a944b020 in ?? () from /lib64/libcudadebugger.so.1
#7 0x00007fb9a95a5997 in ?? () from /lib64/libcudadebugger.so.1
#8 0x00007fc5b1e771ca in start_thread () from /lib64/libpthread.so.0
#9 0x00007fc5b11a38d3 in clone () from /lib64/libc.so.6
Thread 17 (Thread 0x7fba3d9fa000 (LWP 3913346) "cuda-EvtHandlr"):
#0 0x00007fc5b1e7d48c in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
#1 0x00007fb9a955e4ba in ?? () from /lib64/libcudadebugger.so.1
#2 0x00007fb9a955c610 in ?? () from /lib64/libcudadebugger.so.1
#3 0x00007fb9a9173225 in ?? () from /lib64/libcudadebugger.so.1
#4 0x00007fb9a91c7961 in ?? () from /lib64/libcudadebugger.so.1
#5 0x00007fb9a9158a9f in ?? () from /lib64/libcudadebugger.so.1
#6 0x00007fb9a9259bd6 in ?? () from /lib64/libcudadebugger.so.1
#7 0x00007fc5b3b8f42b in ?? () from /lib64/libcuda.so.1
#8 0x00007fc5b3a2c184 in ?? () from /lib64/libcuda.so.1
#9 0x00007fc5b39550f3 in ?? () from /lib64/libcuda.so.1
#10 0x00007fc5b1e771ca in start_thread () from /lib64/libpthread.so.0
#11 0x00007fc5b11a38d3 in clone () from /lib64/libc.so.6
Thread 16 (Thread 0x7fba3e1fb000 (LWP 3913238) "executionLoop"):
#0 0x00007fc5b11a2b8b in sched_yield () from /lib64/libc.so.6
#1 0x00007fc630378523 in (anonymous namespace)::ncclStreamSynchronize(CUstream_st*, ncclComm*) () from /home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/plugins/libnvinfer_plugin_tensorrt_llm.so
#2 0x00007fc63037953c in tensorrt_llm::plugins::AllgatherPlugin::enqueue(nvinfer1::PluginTensorDesc const*, nvinfer1::PluginTensorDesc const*, void const* const*, void* const*, void*, CUstream_st*) () from
/home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/plugins/libnvinfer_plugin_tensorrt_llm.so
#3 0x00007fc5c5b74fec in ?? () from /usr/local/tensorrt/targets/x86_64-linux-gnu/lib/libnvinfer.so.10
#4 0x00007fc5c5ae5a55 in ?? () from /usr/local/tensorrt/targets/x86_64-linux-gnu/lib/libnvinfer.so.10
#5 0x00007fc5c5ae7609 in ?? () from /usr/local/tensorrt/targets/x86_64-linux-gnu/lib/libnvinfer.so.10
#6 0x00007fc614a773a4 in tensorrt_llm::runtime::TllmRuntime::executeContext(int) const () from /home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/libtensorrt_llm.so
#7 0x00007fc614e6308b in tensorrt_llm::batch_manager::TrtGptModelInflightBatching::executeContext(int, int) () from /home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/libtensorrt_llm.so
#8 0x00007fc614e720b6 in tensorrt_llm::batch_manager::TrtGptModelInflightBatching::executeStep(std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> >
> const&, std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, int) () from
/home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/libtensorrt_llm.so
#9 0x00007fc614e728de in tensorrt_llm::batch_manager::TrtGptModelInflightBatching::executeBatch(tensorrt_llm::batch_manager::ScheduledRequests const&) () from
/home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/libtensorrt_llm.so
#10 0x00007fc614e7301b in tensorrt_llm::batch_manager::TrtGptModelInflightBatching::forwardAsync(std::__cxx11::list<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>,
std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&) () from /home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/libtensorrt_llm.so
#11 0x00007fc614f084a1 in tensorrt_llm::executor::Executor::Impl::forwardAsync(std::__cxx11::list<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > >&) ()
from /home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/libtensorrt_llm.so
#12 0x00007fc614f0f5bc in tensorrt_llm::executor::Executor::Impl::executionLoop() () from /home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/libtensorrt_llm.so
#13 0x00007fc5fe7e0930 in execute_native_thread_routine () from
/home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so
#14 0x00007fc5b1e771ca in start_thread () from /lib64/libpthread.so.0
#15 0x00007fc5b11a38d3 in clone () from /lib64/libc.so.6
Thread 15 (Thread 0x7fba3e9fc000 (LWP 3913234) "dataTransResp"):
#0 0x00007fc5b1e7d48c in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
#1 0x00007fc5b1b96870 in std::condition_variable::wait(std::unique_lock<std::mutex>&) () from /lib64/libstdc++.so.6
#2 0x00007fc614deca6b in tensorrt_llm::batch_manager::DataResponder::Impl::response() () from /home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/libtensorrt_llm.so
#3 0x00007fc614deac3d in std::_Function_handler<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> (), std::__future_base::_Task_setter<std::unique_ptr<std::__future_base::_Result<void>,
std::__future_base::_Result_base::_Deleter>, std::thread::_Invoker<std::tuple<void (tensorrt_llm::batch_manager::DataResponder::Impl::*)(), tensorrt_llm::batch_manager::DataResponder::Impl*> >, void> >::_M_invoke(std::_Any_data const&) ()
from /home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/libtensorrt_llm.so
#4 0x000000000043821b in std::__future_base::_State_baseV2::_M_do_set(std::function<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()>*, bool*) ()
#5 0x00007fc5b1e7ee97 in __pthread_once_slow () from /lib64/libpthread.so.0
#6 0x00007fc614deba8d in std::thread::_State_impl<std::thread::_Invoker<std::tuple<std::__future_base::_Async_state_impl<std::thread::_Invoker<std::tuple<void (tensorrt_llm::batch_manager::DataResponder::Impl::*)(),
tensorrt_llm::batch_manager::DataResponder::Impl*> >, void>::_Async_state_impl(std::thread::_Invoker<std::tuple<void (tensorrt_llm::batch_manager::DataResponder::Impl::*)(), tensorrt_llm::batch_manager::DataResponder::Impl*>
>&&)::{lambda()#1}> > >::_M_run() () from /home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/libtensorrt_llm.so
#7 0x00007fc5fe7e0930 in execute_native_thread_routine () from
/home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so
--Type <RET> for more, q to quit, c to continue without paging--
#8 0x00007fc5b1e771ca in start_thread () from /lib64/libpthread.so.0
#9 0x00007fc5b11a38d3 in clone () from /lib64/libc.so.6
Thread 14 (Thread 0x7fba3f8fe000 (LWP 3913230) "executor_server"):
#0 0x00007fc5b1e7d48c in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
#1 0x00007fc5b5938727 in ncclProxyProgress(void*) () from /lib64/libnccl.so.2
#2 0x00007fc5b1e771ca in start_thread () from /lib64/libpthread.so.0
#3 0x00007fc5b11a38d3 in clone () from /lib64/libc.so.6
Thread 13 (Thread 0x7fba6aba8000 (LWP 3913221) "executor_server"):
#0 0x00007fc5b129cac1 in poll () from /lib64/libc.so.6
#1 0x00007fc5b593747a in ncclProxyServiceUDS(void*) () from /lib64/libnccl.so.2
#2 0x00007fc5b1e771ca in start_thread () from /lib64/libpthread.so.0
#3 0x00007fc5b11a38d3 in clone () from /lib64/libc.so.6
Thread 12 (Thread 0x7fba6b3a9000 (LWP 3913219) "executor_server"):
#0 0x00007fc5b129cac1 in poll () from /lib64/libc.so.6
#1 0x00007fc5b5938dc2 in ncclProxyService(void*) () from /lib64/libnccl.so.2
#2 0x00007fc5b1e771ca in start_thread () from /lib64/libpthread.so.0
#3 0x00007fc5b11a38d3 in clone () from /lib64/libc.so.6
Thread 11 (Thread 0x7fbee6e04000 (LWP 3913215) "executor_server"):
#0 0x00007fc5b1e80ae4 in read () from /lib64/libpthread.so.0
#1 0x00007fc59f8f2fa3 in ibv_get_async_event () from /lib64/libibverbs.so.1
#2 0x00007fc5b59690d2 in wrap_ibv_get_async_event(ibv_context*, ibv_async_event*) () from /lib64/libnccl.so.2
#3 0x00007fc5b598bc04 in ncclIbAsyncThreadMain(void*) () from /lib64/libnccl.so.2
#4 0x00007fc5b1e771ca in start_thread () from /lib64/libpthread.so.0
#5 0x00007fc5b11a38d3 in clone () from /lib64/libc.so.6
Thread 10 (Thread 0x7fbee7605000 (LWP 3913214) "executor_server"):
#0 0x00007fc5b1e80ae4 in read () from /lib64/libpthread.so.0
#1 0x00007fc59f8f2fa3 in ibv_get_async_event () from /lib64/libibverbs.so.1
#2 0x00007fc5b59690d2 in wrap_ibv_get_async_event(ibv_context*, ibv_async_event*) () from /lib64/libnccl.so.2
#3 0x00007fc5b598bc04 in ncclIbAsyncThreadMain(void*) () from /lib64/libnccl.so.2
#4 0x00007fc5b1e771ca in start_thread () from /lib64/libpthread.so.0
#5 0x00007fc5b11a38d3 in clone () from /lib64/libc.so.6
Thread 9 (Thread 0x7fbee7e06000 (LWP 3913213) "executor_server"):
#0 0x00007fc5b1e80ae4 in read () from /lib64/libpthread.so.0
#1 0x00007fc59f8f2fa3 in ibv_get_async_event () from /lib64/libibverbs.so.1
#2 0x00007fc5b59690d2 in wrap_ibv_get_async_event(ibv_context*, ibv_async_event*) () from /lib64/libnccl.so.2
#3 0x00007fc5b598bc04 in ncclIbAsyncThreadMain(void*) () from /lib64/libnccl.so.2
#4 0x00007fc5b1e771ca in start_thread () from /lib64/libpthread.so.0
#5 0x00007fc5b11a38d3 in clone () from /lib64/libc.so.6
Thread 8 (Thread 0x7fc388e83000 (LWP 3913212) "executor_server"):
#0 0x00007fc5b1e80ae4 in read () from /lib64/libpthread.so.0
#1 0x00007fc59f8f2fa3 in ibv_get_async_event () from /lib64/libibverbs.so.1
#2 0x00007fc5b59690d2 in wrap_ibv_get_async_event(ibv_context*, ibv_async_event*) () from /lib64/libnccl.so.2
#3 0x00007fc5b598bc04 in ncclIbAsyncThreadMain(void*) () from /lib64/libnccl.so.2
#4 0x00007fc5b1e771ca in start_thread () from /lib64/libpthread.so.0
#5 0x00007fc5b11a38d3 in clone () from /lib64/libc.so.6
Thread 7 (Thread 0x7fc389684000 (LWP 3913211) "executor_server"):
#0 0x00007fc5b1e80ae4 in read () from /lib64/libpthread.so.0
#1 0x00007fc59f8f2fa3 in ibv_get_async_event () from /lib64/libibverbs.so.1
#2 0x00007fc5b59690d2 in wrap_ibv_get_async_event(ibv_context*, ibv_async_event*) () from /lib64/libnccl.so.2
#3 0x00007fc5b598bc04 in ncclIbAsyncThreadMain(void*) () from /lib64/libnccl.so.2
#4 0x00007fc5b1e771ca in start_thread () from /lib64/libpthread.so.0
#5 0x00007fc5b11a38d3 in clone () from /lib64/libc.so.6
Thread 6 (Thread 0x7fc389e85000 (LWP 3913210) "executor_server"):
#0 0x00007fc5b1e80ae4 in read () from /lib64/libpthread.so.0
#1 0x00007fc59f8f2fa3 in ibv_get_async_event () from /lib64/libibverbs.so.1
#2 0x00007fc5b59690d2 in wrap_ibv_get_async_event(ibv_context*, ibv_async_event*) () from /lib64/libnccl.so.2
#3 0x00007fc5b598bc04 in ncclIbAsyncThreadMain(void*) () from /lib64/libnccl.so.2
--Type <RET> for more, q to quit, c to continue without paging--
#4 0x00007fc5b1e771ca in start_thread () from /lib64/libpthread.so.0
#5 0x00007fc5b11a38d3 in clone () from /lib64/libc.so.6
Thread 5 (Thread 0x7fc38bf7b000 (LWP 3913175) "cuda-EvtHandlr"):
#0 0x00007fc5b129cac1 in poll () from /lib64/libc.so.6
#1 0x00007fc5b3958e4f in ?? () from /lib64/libcuda.so.1
#2 0x00007fc5b3a2c12f in ?? () from /lib64/libcuda.so.1
#3 0x00007fc5b39550f3 in ?? () from /lib64/libcuda.so.1
#4 0x00007fc5b1e771ca in start_thread () from /lib64/libpthread.so.0
#5 0x00007fc5b11a38d3 in clone () from /lib64/libc.so.6
Thread 4 (Thread 0x7fc59b7c6000 (LWP 3913168) "cuda00006000019"):
#0 0x00007fc5b129cac1 in poll () from /lib64/libc.so.6
#1 0x00007fc5b3958e4f in ?? () from /lib64/libcuda.so.1
#2 0x00007fc5b3a2c12f in ?? () from /lib64/libcuda.so.1
#3 0x00007fc5b39550f3 in ?? () from /lib64/libcuda.so.1
#4 0x00007fc5b1e771ca in start_thread () from /lib64/libpthread.so.0
#5 0x00007fc5b11a38d3 in clone () from /lib64/libc.so.6
Thread 3 (Thread 0x7fc5a75c7000 (LWP 3913157) "executor_server"):
#0 0x00007fc5b12a9307 in epoll_wait () from /lib64/libc.so.6
#1 0x00007fc5afcc0509 in ?? () from /lib64/libevent_core-2.1.so.6
#2 0x00007fc5afcb61e8 in event_base_loop () from /lib64/libevent_core-2.1.so.6
#3 0x00007fc5ad8ea276 in ?? () from /lib64/libpmix.so.2
#4 0x00007fc5b1e771ca in start_thread () from /lib64/libpthread.so.0
#5 0x00007fc5b11a38d3 in clone () from /lib64/libc.so.6
Thread 2 (Thread 0x7fc5ae582000 (LWP 3913153) "executor_server"):
#0 0x00007fc5b129cac1 in poll () from /lib64/libc.so.6
#1 0x00007fc5afcbf8e9 in ?? () from /lib64/libevent_core-2.1.so.6
#2 0x00007fc5afcb61e8 in event_base_loop () from /lib64/libevent_core-2.1.so.6
#3 0x00007fc5b0356d96 in ?? () from /usr/lib64/openmpi/lib/libopen-pal.so.40
#4 0x00007fc5b1e771ca in start_thread () from /lib64/libpthread.so.0
#5 0x00007fc5b11a38d3 in clone () from /lib64/libc.so.6
Thread 1 (Thread 0x7fc6357f4000 (LWP 3913151) "executor_server"):
#0 0x00007fc5b1e786cd in __pthread_timedjoin_ex () from /lib64/libpthread.so.0
#1 0x00007fc5b1b9cda7 in std::thread::join() () from /lib64/libstdc++.so.6
#2 0x00007fc614f0c009 in tensorrt_llm::executor::Executor::Impl::shutdown() () from /home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/libtensorrt_llm.so
#3 0x00007fc614f0c0f6 in tensorrt_llm::executor::Executor::Impl::~Impl() () from /home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/libtensorrt_llm.so
#4 0x00007fc614ef0f81 in tensorrt_llm::executor::Executor::~Executor() () from /home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/libtensorrt_llm.so
#5 0x000000000046247a in modules::executor_server::Executor::ExecutorImpl::~ExecutorImpl() ()
#6 0x000000000046e701 in modules::executor_server::Executor::~Executor() ()
#7 0x0000000000440381 in std::unique_ptr<modules::executor_server::Executor, std::default_delete<modules::executor_server::Executor> >::~unique_ptr() ()
#8 0x000000000042d4fc in main ()
cuda-gdb rank3
(cuda-gdb) info cuda kernels
No CUDA kernels.
(cuda-gdb) thread apply all bt
Thread 19 (Thread 0x7f0ec48d9000 (LWP 3268986) "executor_server"):
#0 0x00007f1a39775307 in epoll_wait () from /lib64/libc.so.6
#1 0x00007f0e2b015984 in ?? () from /lib64/libcudadebugger.so.1
#2 0x00007f0e2b016462 in ?? () from /lib64/libcudadebugger.so.1
#3 0x00007f0e2b014610 in ?? () from /lib64/libcudadebugger.so.1
#4 0x00007f0e2b01daba in ?? () from /lib64/libcudadebugger.so.1
#5 0x00007f0e2b01dfbc in ?? () from /lib64/libcudadebugger.so.1
#6 0x00007f0e2af03020 in ?? () from /lib64/libcudadebugger.so.1
#7 0x00007f0e2b05d997 in ?? () from /lib64/libcudadebugger.so.1
#8 0x00007f1a3a3431ca in start_thread () from /lib64/libpthread.so.0
#9 0x00007f1a3966f8d3 in clone () from /lib64/libc.so.6
Thread 17 (Thread 0x7f0ec60fb000 (LWP 3913347) "cuda-EvtHandlr"):
#0 0x00007f1a3a34948c in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
#1 0x00007f0e2b0164ba in ?? () from /lib64/libcudadebugger.so.1
#2 0x00007f0e2b014610 in ?? () from /lib64/libcudadebugger.so.1
#3 0x00007f0e2ac2b225 in ?? () from /lib64/libcudadebugger.so.1
#4 0x00007f0e2ac7f961 in ?? () from /lib64/libcudadebugger.so.1
#5 0x00007f0e2ac10a9f in ?? () from /lib64/libcudadebugger.so.1
#6 0x00007f0e2ad11bd6 in ?? () from /lib64/libcudadebugger.so.1
#7 0x00007f1a3c05b42b in ?? () from /lib64/libcuda.so.1
#8 0x00007f1a3bef8184 in ?? () from /lib64/libcuda.so.1
#9 0x00007f1a3be210f3 in ?? () from /lib64/libcuda.so.1
#10 0x00007f1a3a3431ca in start_thread () from /lib64/libpthread.so.0
#11 0x00007f1a3966f8d3 in clone () from /lib64/libc.so.6
Thread 16 (Thread 0x7f0ec68fc000 (LWP 3913239) "executionLoop"):
#0 0x00007f1a3966f41d in syscall () from /lib64/libc.so.6
#1 0x00007f0e2b05c730 in ?? () from /lib64/libcudadebugger.so.1
#2 0x00007f0e2abf5fd5 in ?? () from /lib64/libcudadebugger.so.1
#3 0x00007f0e2ad11bd6 in ?? () from /lib64/libcudadebugger.so.1
#4 0x00007f1a3bfe9d76 in ?? () from /lib64/libcuda.so.1
#5 0x00007f1ab88d0ac5 in cudaStreamQuery () from /home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/plugins/libnvinfer_plugin_tensorrt_llm.so
#6 0x00007f1ab884452b in (anonymous namespace)::ncclStreamSynchronize(CUstream_st*, ncclComm*) () from /home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/plugins/libnvinfer_plugin_tensorrt_llm.so
#7 0x00007f1ab884553c in tensorrt_llm::plugins::AllgatherPlugin::enqueue(nvinfer1::PluginTensorDesc const*, nvinfer1::PluginTensorDesc const*, void const* const*, void* const*, void*, CUstream_st*) () from
/home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/plugins/libnvinfer_plugin_tensorrt_llm.so
#8 0x00007f1a4e040fec in ?? () from /usr/local/tensorrt/targets/x86_64-linux-gnu/lib/libnvinfer.so.10
#9 0x00007f1a4dfb1a55 in ?? () from /usr/local/tensorrt/targets/x86_64-linux-gnu/lib/libnvinfer.so.10
#10 0x00007f1a4dfb3609 in ?? () from /usr/local/tensorrt/targets/x86_64-linux-gnu/lib/libnvinfer.so.10
#11 0x00007f1a9cf433a4 in tensorrt_llm::runtime::TllmRuntime::executeContext(int) const () from /home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/libtensorrt_llm.so
#12 0x00007f1a9d32f08b in tensorrt_llm::batch_manager::TrtGptModelInflightBatching::executeContext(int, int) () from /home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/libtensorrt_llm.so
#13 0x00007f1a9d33e0b6 in tensorrt_llm::batch_manager::TrtGptModelInflightBatching::executeStep(std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> >
> const&, std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, int) () from
/home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/libtensorrt_llm.so
#14 0x00007f1a9d33e8de in tensorrt_llm::batch_manager::TrtGptModelInflightBatching::executeBatch(tensorrt_llm::batch_manager::ScheduledRequests const&) () from
/home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/libtensorrt_llm.so
#15 0x00007f1a9d33f01b in tensorrt_llm::batch_manager::TrtGptModelInflightBatching::forwardAsync(std::__cxx11::list<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>,
std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&) () from /home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/libtensorrt_llm.so
#16 0x00007f1a9d3d44a1 in tensorrt_llm::executor::Executor::Impl::forwardAsync(std::__cxx11::list<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > >&) ()
from /home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/libtensorrt_llm.so
#17 0x00007f1a9d3db5bc in tensorrt_llm::executor::Executor::Impl::executionLoop() () from /home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/libtensorrt_llm.so
#18 0x00007f1a86cac930 in execute_native_thread_routine () from
/home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so
#19 0x00007f1a3a3431ca in start_thread () from /lib64/libpthread.so.0
#20 0x00007f1a3966f8d3 in clone () from /lib64/libc.so.6
Thread 15 (Thread 0x7f0ec70fd000 (LWP 3913236) "dataTransResp"):
#0 0x00007f1a3a34948c in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
#1 0x00007f1a3a062870 in std::condition_variable::wait(std::unique_lock<std::mutex>&) () from /lib64/libstdc++.so.6
#2 0x00007f1a9d2b8a6b in tensorrt_llm::batch_manager::DataResponder::Impl::response() () from /home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/libtensorrt_llm.so
#3 0x00007f1a9d2b6c3d in std::_Function_handler<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> (), std::__future_base::_Task_setter<std::unique_ptr<std::__future_base::_Result<void>,
std::__future_base::_Result_base::_Deleter>, std::thread::_Invoker<std::tuple<void (tensorrt_llm::batch_manager::DataResponder::Impl::*)(), tensorrt_llm::batch_manager::DataResponder::Impl*> >, void> >::_M_invoke(std::_Any_data const&) ()
from /home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/libtensorrt_llm.so
#4 0x000000000043821b in std::__future_base::_State_baseV2::_M_do_set(std::function<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()>*, bool*) ()
#5 0x00007f1a3a34ae97 in __pthread_once_slow () from /lib64/libpthread.so.0
--Type <RET> for more, q to quit, c to continue without paging--
#6 0x00007f1a9d2b7a8d in std::thread::_State_impl<std::thread::_Invoker<std::tuple<std::__future_base::_Async_state_impl<std::thread::_Invoker<std::tuple<void (tensorrt_llm::batch_manager::DataResponder::Impl::*)(),
tensorrt_llm::batch_manager::DataResponder::Impl*> >, void>::_Async_state_impl(std::thread::_Invoker<std::tuple<void (tensorrt_llm::batch_manager::DataResponder::Impl::*)(), tensorrt_llm::batch_manager::DataResponder::Impl*>
>&&)::{lambda()#1}> > >::_M_run() () from /home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/libtensorrt_llm.so
#7 0x00007f1a86cac930 in execute_native_thread_routine () from
/home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so
#8 0x00007f1a3a3431ca in start_thread () from /lib64/libpthread.so.0
#9 0x00007f1a3966f8d3 in clone () from /lib64/libc.so.6
Thread 14 (Thread 0x7f0ec7fff000 (LWP 3913231) "executor_server"):
#0 0x00007f1a3a34948c in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
#1 0x00007f1a3de04727 in ncclProxyProgress(void*) () from /lib64/libnccl.so.2
#2 0x00007f1a3a3431ca in start_thread () from /lib64/libpthread.so.0
#3 0x00007f1a3966f8d3 in clone () from /lib64/libc.so.6
Thread 13 (Thread 0x7f0ef33a9000 (LWP 3913224) "executor_server"):
#0 0x00007f1a39768ac1 in poll () from /lib64/libc.so.6
#1 0x00007f1a3de0347a in ncclProxyServiceUDS(void*) () from /lib64/libnccl.so.2
#2 0x00007f1a3a3431ca in start_thread () from /lib64/libpthread.so.0
#3 0x00007f1a3966f8d3 in clone () from /lib64/libc.so.6
Thread 12 (Thread 0x7f136ee04000 (LWP 3913223) "executor_server"):
#0 0x00007f1a39768ac1 in poll () from /lib64/libc.so.6
#1 0x00007f1a3de04dc2 in ncclProxyService(void*) () from /lib64/libnccl.so.2
#2 0x00007f1a3a3431ca in start_thread () from /lib64/libpthread.so.0
#3 0x00007f1a3966f8d3 in clone () from /lib64/libc.so.6
Thread 11 (Thread 0x7f136f605000 (LWP 3913201) "executor_server"):
#0 0x00007f1a3a34cae4 in read () from /lib64/libpthread.so.0
#1 0x00007f1a27df1fa3 in ibv_get_async_event () from /lib64/libibverbs.so.1
#2 0x00007f1a3de350d2 in wrap_ibv_get_async_event(ibv_context*, ibv_async_event*) () from /lib64/libnccl.so.2
#3 0x00007f1a3de57c04 in ncclIbAsyncThreadMain(void*) () from /lib64/libnccl.so.2
#4 0x00007f1a3a3431ca in start_thread () from /lib64/libpthread.so.0
#5 0x00007f1a3966f8d3 in clone () from /lib64/libc.so.6
Thread 10 (Thread 0x7f136fe06000 (LWP 3913198) "executor_server"):
#0 0x00007f1a3a34cae4 in read () from /lib64/libpthread.so.0
#1 0x00007f1a27df1fa3 in ibv_get_async_event () from /lib64/libibverbs.so.1
#2 0x00007f1a3de350d2 in wrap_ibv_get_async_event(ibv_context*, ibv_async_event*) () from /lib64/libnccl.so.2
#3 0x00007f1a3de57c04 in ncclIbAsyncThreadMain(void*) () from /lib64/libnccl.so.2
#4 0x00007f1a3a3431ca in start_thread () from /lib64/libpthread.so.0
#5 0x00007f1a3966f8d3 in clone () from /lib64/libc.so.6
Thread 9 (Thread 0x7f1810882000 (LWP 3913196) "executor_server"):
#0 0x00007f1a3a34cae4 in read () from /lib64/libpthread.so.0
#1 0x00007f1a27df1fa3 in ibv_get_async_event () from /lib64/libibverbs.so.1
#2 0x00007f1a3de350d2 in wrap_ibv_get_async_event(ibv_context*, ibv_async_event*) () from /lib64/libnccl.so.2
#3 0x00007f1a3de57c04 in ncclIbAsyncThreadMain(void*) () from /lib64/libnccl.so.2
#4 0x00007f1a3a3431ca in start_thread () from /lib64/libpthread.so.0
#5 0x00007f1a3966f8d3 in clone () from /lib64/libc.so.6
Thread 8 (Thread 0x7f1811083000 (LWP 3913194) "executor_server"):
#0 0x00007f1a3a34cae4 in read () from /lib64/libpthread.so.0
#1 0x00007f1a27df1fa3 in ibv_get_async_event () from /lib64/libibverbs.so.1
#2 0x00007f1a3de350d2 in wrap_ibv_get_async_event(ibv_context*, ibv_async_event*) () from /lib64/libnccl.so.2
#3 0x00007f1a3de57c04 in ncclIbAsyncThreadMain(void*) () from /lib64/libnccl.so.2
#4 0x00007f1a3a3431ca in start_thread () from /lib64/libpthread.so.0
#5 0x00007f1a3966f8d3 in clone () from /lib64/libc.so.6
Thread 7 (Thread 0x7f1811884000 (LWP 3913192) "executor_server"):
#0 0x00007f1a3a34cae4 in read () from /lib64/libpthread.so.0
#1 0x00007f1a27df1fa3 in ibv_get_async_event () from /lib64/libibverbs.so.1
#2 0x00007f1a3de350d2 in wrap_ibv_get_async_event(ibv_context*, ibv_async_event*) () from /lib64/libnccl.so.2
#3 0x00007f1a3de57c04 in ncclIbAsyncThreadMain(void*) () from /lib64/libnccl.so.2
#4 0x00007f1a3a3431ca in start_thread () from /lib64/libpthread.so.0
#5 0x00007f1a3966f8d3 in clone () from /lib64/libc.so.6
--Type <RET> for more, q to quit, c to continue without paging--
Thread 6 (Thread 0x7f1812085000 (LWP 3913186) "executor_server"):
#0 0x00007f1a3a34cae4 in read () from /lib64/libpthread.so.0
#1 0x00007f1a27df1fa3 in ibv_get_async_event () from /lib64/libibverbs.so.1
#2 0x00007f1a3de350d2 in wrap_ibv_get_async_event(ibv_context*, ibv_async_event*) () from /lib64/libnccl.so.2
#3 0x00007f1a3de57c04 in ncclIbAsyncThreadMain(void*) () from /lib64/libnccl.so.2
#4 0x00007f1a3a3431ca in start_thread () from /lib64/libpthread.so.0
#5 0x00007f1a3966f8d3 in clone () from /lib64/libc.so.6
Thread 5 (Thread 0x7f1a20db1000 (LWP 3913176) "cuda-EvtHandlr"):
#0 0x00007f1a39768ac1 in poll () from /lib64/libc.so.6
#1 0x00007f1a3be24e4f in ?? () from /lib64/libcuda.so.1
#2 0x00007f1a3bef812f in ?? () from /lib64/libcuda.so.1
#3 0x00007f1a3be210f3 in ?? () from /lib64/libcuda.so.1
#4 0x00007f1a3a3431ca in start_thread () from /lib64/libpthread.so.0
#5 0x00007f1a3966f8d3 in clone () from /lib64/libc.so.6
Thread 4 (Thread 0x7f1a23ecd000 (LWP 3913169) "cuda00006000019"):
#0 0x00007f1a39768ac1 in poll () from /lib64/libc.so.6
#1 0x00007f1a3be24e4f in ?? () from /lib64/libcuda.so.1
#2 0x00007f1a3bef812f in ?? () from /lib64/libcuda.so.1
#3 0x00007f1a3be210f3 in ?? () from /lib64/libcuda.so.1
#4 0x00007f1a3a3431ca in start_thread () from /lib64/libpthread.so.0
#5 0x00007f1a3966f8d3 in clone () from /lib64/libc.so.6
Thread 3 (Thread 0x7f1a2fbf0000 (LWP 3913158) "executor_server"):
#0 0x00007f1a39775307 in epoll_wait () from /lib64/libc.so.6
#1 0x00007f1a3818c509 in ?? () from /lib64/libevent_core-2.1.so.6
#2 0x00007f1a381821e8 in event_base_loop () from /lib64/libevent_core-2.1.so.6
#3 0x00007f1a35db6276 in ?? () from /lib64/libpmix.so.2
#4 0x00007f1a3a3431ca in start_thread () from /lib64/libpthread.so.0
#5 0x00007f1a3966f8d3 in clone () from /lib64/libc.so.6
Thread 2 (Thread 0x7f1a36a4e000 (LWP 3913154) "executor_server"):
#0 0x00007f1a39768ac1 in poll () from /lib64/libc.so.6
#1 0x00007f1a3818b8e9 in ?? () from /lib64/libevent_core-2.1.so.6
#2 0x00007f1a381821e8 in event_base_loop () from /lib64/libevent_core-2.1.so.6
#3 0x00007f1a38822d96 in ?? () from /usr/lib64/openmpi/lib/libopen-pal.so.40
#4 0x00007f1a3a3431ca in start_thread () from /lib64/libpthread.so.0
#5 0x00007f1a3966f8d3 in clone () from /lib64/libc.so.6
Thread 1 (Thread 0x7f1abdcc0000 (LWP 3913152) "executor_server"):
#0 0x00007f1a3a3446cd in __pthread_timedjoin_ex () from /lib64/libpthread.so.0
#1 0x00007f1a3a068da7 in std::thread::join() () from /lib64/libstdc++.so.6
#2 0x00007f1a9d3d8009 in tensorrt_llm::executor::Executor::Impl::shutdown() () from /home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/libtensorrt_llm.so
#3 0x00007f1a9d3d80f6 in tensorrt_llm::executor::Executor::Impl::~Impl() () from /home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/libtensorrt_llm.so
#4 0x00007f1a9d3bcf81 in tensorrt_llm::executor::Executor::~Executor() () from /home/askhoroshev/trtllm_github/TensorRT-LLM/cpp/build/tensorrt_llm/libtensorrt_llm.so
#5 0x000000000046247a in modules::executor_server::Executor::ExecutorImpl::~ExecutorImpl() ()
#6 0x000000000046e701 in modules::executor_server::Executor::~Executor() ()
#7 0x0000000000440381 in std::unique_ptr<modules::executor_server::Executor, std::default_delete<modules::executor_server::Executor> >::~unique_ptr() ()
#8 0x000000000042d4fc in main () |
This problem was solved for me after setting the FORCE_NCCL_ALL_REDUCE_STRATEGY=1 environment variable. I don't know why the custom all-reduce affects all-gather, but the problem is gone. The custom all-reduce implementation probably has undefined behavior (UB) that affects NCCL. |
My version
TP4 FP8 DeepSeek-like custom model, served through the Executor API (started with an `mpirun -n 4` command).
nccl version: 2.22.3
I use batched logit processor to control generation:
Main rank batch logit processor config (does logit modification)
Other ranks config (actually do nothing)
When running the stability test everything works fine for the first 20 hours, but after that time (or about 2 million requests) the server hangs.
Attaching cuda-gdb to the MAIN rank's PID to determine the active kernel (the other ranks do not have active kernels):
some meaningful backtraces
Is it related to NVIDIA/nccl#311?
The text was updated successfully, but these errors were encountered: