Update TensorRT-LLM (#2532)

* blossom-ci.yml: run vulnerability scan on blossom
* open source efb18c1256f8c9c3d47b7d0c740b83e5d5ebe0ec

---------

Co-authored-by: niukuo <[email protected]>
Co-authored-by: pei0033 <[email protected]>
Co-authored-by: Kyungmin Lee <[email protected]>
Co-authored-by: Kaiyu Xie <[email protected]>
NVIDIA · Dec 4, 2024 · 548b5b7 · 548b5b7
1 parent 4420547
commit 548b5b7
Show file tree

Hide file tree

Showing 762 changed files with 1,673,620 additions and 1,550,597 deletions.
diff --git a/.gitignore b/.gitignore
@@ -44,6 +44,7 @@ docs/source/llm-api-examples/llm_*.rst
 # Testing
 .coverage.*
 results_trt/
+llm-test-workspace/
 
 # build/debug
 *.safetensors

diff --git a/.gitmodules b/.gitmodules
@@ -17,3 +17,6 @@
 [submodule "3rdparty/pybind11"]
 	path = 3rdparty/pybind11
 	url = https://github.com/pybind/pybind11.git
+[submodule "3rdparty/xgrammar"]
+	path = 3rdparty/xgrammar
+	url = https://github.com/mlc-ai/xgrammar.git
diff --git a/3rdparty/xgrammar b/3rdparty/xgrammar
diff --git a/benchmarks/cpp/disaggServerBenchmark.cpp b/benchmarks/cpp/disaggServerBenchmark.cpp
@@ -812,8 +812,9 @@ class DisaggExecutorServer
             }
             if (mEnableCollectIterStats)
             {
-                for (auto const& iterStats : contextStats)
+                for (std::size_t i = 0; i < contextStats.size(); i++)
                 {
+                    auto const& iterStats = contextStats.at(i);
                     for (auto const& stat : iterStats)
                     {
                         SizeType32 numNewActiveRequests = stat.numNewActiveRequests;
@@ -826,13 +827,15 @@ class DisaggExecutorServer
                         }
                         if (mLogIterationData)
                         {
-                            TLLM_LOG_INFO(texec::JsonSerialization::toJsonStr(stat));
+                            TLLM_LOG_INFO(
+                                "ctx_id %d, ctx_stat: %s", i, texec::JsonSerialization::toJsonStr(stat).c_str());
                         }
                     }
                 }
 
-                for (auto const& iterStats : generationStats)
+                for (std::size_t i = 0; i < generationStats.size(); i++)
                 {
+                    auto const& iterStats = generationStats.at(i);
                     for (auto const& stat : iterStats)
                     {
                         SizeType32 numNewActiveRequests = stat.numNewActiveRequests;
@@ -845,7 +848,8 @@ class DisaggExecutorServer
                         }
                         if (mLogIterationData)
                         {
-                            TLLM_LOG_INFO(texec::JsonSerialization::toJsonStr(stat));
+                            TLLM_LOG_INFO(
+                                "gen_id %d, gen_stat: %s", i, texec::JsonSerialization::toJsonStr(stat).c_str());
                         }
                     }
                 }
@@ -854,9 +858,9 @@ class DisaggExecutorServer
             {
                 continue;
             }
-            for (auto const& stats : generationRequestStatsPerIteration)
+            for (std::size_t i = 0; i < generationRequestStatsPerIteration.size(); i++)
             {
-
+                auto const& stats = generationRequestStatsPerIteration.at(i);
                 for (auto const& stat : stats)
                 {
                     std::vector<float> kvCacheTransferMs;
@@ -874,7 +878,8 @@ class DisaggExecutorServer
                     }
                     if (mLogIterationData)
                     {
-                        TLLM_LOG_INFO(texec::JsonSerialization::toJsonStr(stat));
+                        TLLM_LOG_INFO(
+                            "gen_id %d, gen_req_stat: %s", i, texec::JsonSerialization::toJsonStr(stat).c_str());
                     }
                 }
             }
@@ -973,6 +978,7 @@ void benchmark(std::vector<std::filesystem::path> const& contextEngineDirs,
     if (worldRank == 0)
     {
         { // warmup
+            TLLM_LOG_INFO("Warmup start");
             std::vector<tensorrt_llm::executor::Request> contextRequests;
             contextRequests.reserve(warmUp);
             for (int i = 0; i < warmUp; ++i)
@@ -989,6 +995,7 @@ void benchmark(std::vector<std::filesystem::path> const& contextEngineDirs,
             disaggExecutor->waitForGenResponse(warmUp, true);
             auto const warmUpWaitSleep = std::chrono::milliseconds(50);
             std::this_thread::sleep_for(warmUpWaitSleep);
+            TLLM_LOG_INFO("Warmup done");
         }
 
         {